From 8680bf8c4213551a8f3ffa47ae81589b63ff2ebc Mon Sep 17 00:00:00 2001
From: AlongWY
Date: Sun, 3 Sep 2023 05:20:22 +0000
Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac

---
 .nojekyll   |     0
 cache.json  |     1 +
 favicon.ico |   Bin 0 -> 15086 bytes
 index.css   |   355 +
 index.html  | 57366 ++++++++++++++++++++++++++++++++++++++++++++++++++
 index.js    |    39 +
 6 files changed, 57761 insertions(+)
 create mode 100644 .nojekyll
 create mode 100644 cache.json
 create mode 100644 favicon.ico
 create mode 100644 index.css
 create mode 100644 index.html
 create mode 100644 index.js

diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..4cf91ab7
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2023-08-28T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2209.06767v3","updated":"2023-08-28T17:59:49Z","published":"2022-09-14T16:45:13Z","title":"Parameter-Efficient Finetuning for Robust Continual Multilingual\n Learning","summary":" We introduce and study the problem of Continual Multilingual Learning (CML)\nwhere a previously trained multilingual model is periodically updated using new\ndata arriving in stages. If the new data is present only in a subset of\nlanguages, we find that the resulting model shows improved performance only on\nthe languages included in the latest update (and a few closely related\nlanguages) while its performance on all the remaining languages degrades\nsignificantly. We address this challenge by proposing LAFT-URIEL, a\nparameter-efficient finetuning strategy which aims to increase the number of\nlanguages on which the model improves after an update, while reducing the\nmagnitude of loss in performance for the remaining languages. LAFT-URIEL uses\nlinguistic knowledge to balance overfitting and knowledge sharing across\nlanguages, allowing for an additional 25% of task languages to see an\nimprovement in performance after an update, while also reducing the average\nmagnitude of losses on the remaining languages by 78% relative.\n","authors":["Kartikeya Badola","Shachi Dave","Partha Talukdar"],"pdf_url":"https://arxiv.org/pdf/2209.06767v3.pdf","comment":"Published at ACL Findings 2023"},{"id":"http://arxiv.org/abs/2308.13506v2","updated":"2023-08-28T17:46:59Z","published":"2023-08-25T17:31:46Z","title":"Training and Meta-Evaluating Machine Translation Evaluation Metrics at\n the Paragraph Level","summary":" As research on machine translation moves to translating text beyond the\nsentence level, it remains unclear how effective automatic evaluation metrics\nare at scoring longer translations. In this work, we first propose a method for\ncreating paragraph-level data for training and meta-evaluating metrics from\nexisting sentence-level data. Then, we use these new datasets to benchmark\nexisting sentence-level metrics as well as train learned metrics at the\nparagraph level. Interestingly, our experimental results demonstrate that using\nsentence-level metrics to score entire paragraphs is equally as effective as\nusing a metric designed to work at the paragraph level.
We speculate this\nresult can be attributed to properties of the task of reference-based\nevaluation as well as limitations of our datasets with respect to capturing all\ntypes of phenomena that occur in paragraph-level translations.\n","authors":["Daniel Deutsch","Juraj Juraska","Mara Finkelstein","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2308.13506v2.pdf","comment":"Removing extra \"and\" from author list"},{"id":"http://arxiv.org/abs/2308.14683v1","updated":"2023-08-28T16:18:50Z","published":"2023-08-28T16:18:50Z","title":"Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual\n Predatory Chats and Abusive Texts","summary":" Detecting online sexual predatory behaviours and abusive language on social\nmedia platforms has become a critical area of research due to the growing\nconcerns about online safety, especially for vulnerable populations such as\nchildren and adolescents. Researchers have been exploring various techniques\nand approaches to develop effective detection systems that can identify and\nmitigate these risks. Recent development of large language models (LLMs) has\nopened a new opportunity to address this problem more effectively. This paper\nproposes an approach to detection of online sexual predatory chats and abusive\nlanguage using the open-source pretrained Llama 2 7B-parameter model, recently\nreleased by Meta GenAI. We fine-tune the LLM using datasets with different\nsizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu).\nBased on the power of LLMs, our approach is generic and automated without a\nmanual search for a synergy between feature extraction and classifier design\nsteps like conventional methods in this domain. Experimental results show a\nstrong performance of the proposed approach, which performs proficiently and\nconsistently across three distinct datasets with five sets of experiments. This\nstudy's outcomes indicate that the proposed method can be implemented in\nreal-world applications (even with non-English languages) for flagging sexual\npredators, offensive or toxic content, hate speech, and discriminatory language\nin online discussions and comments to maintain respectful internet or digital\ncommunities. Furthermore, it can be employed for solving text classification\nproblems with other potential applications such as sentiment analysis, spam and\nphishing detection, sorting legal documents, fake news detection, language\nidentification, user intent recognition, text-based product categorization,\nmedical record analysis, and resume screening.\n","authors":["Thanh Thi Nguyen","Campbell Wilson","Janis Dalins"],"pdf_url":"https://arxiv.org/pdf/2308.14683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12421v3","updated":"2023-08-28T16:15:21Z","published":"2023-05-21T10:40:55Z","title":"Evaluating Open-QA Evaluation","summary":" This study focuses on the evaluation of the Open Question Answering (Open-QA)\ntask, which can directly estimate the factuality of large language models\n(LLMs). Current automatic evaluation methods have shown limitations, indicating\nthat human evaluation still remains the most reliable approach. We introduce a\nnew task, Evaluating QA Evaluation (QA-Eval) and the corresponding dataset\nEVOUNA, designed to assess the accuracy of AI-generated answers in relation to\nstandard answers within Open-QA. Our evaluation of these methods utilizes\nhuman-annotated results to measure their performance. 
Specifically, the work\ninvestigates methods that show high correlation with human evaluations, deeming\nthem more reliable. We also discuss the pitfalls of current methods and methods\nto improve LLM-based evaluators. We believe this new QA-Eval task and\ncorresponding dataset EVOUNA will facilitate the development of more effective\nautomatic evaluation tools and prove valuable for future research in this area.\nAll resources are available at \\url{https://github.com/wangcunxiang/QA-Eval}\nand it is under the Apache-2.0 License.\n","authors":["Cunxiang Wang","Sirui Cheng","Qipeng Guo","Zhikun Xu","Bowen Ding","Yidong Wang","Xiangkun Hu","Zheng Zhang","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.12421v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14669v1","updated":"2023-08-28T15:54:48Z","published":"2023-08-28T15:54:48Z","title":"ANER: Arabic and Arabizi Named Entity Recognition using\n Transformer-Based Approach","summary":" One of the main tasks of Natural Language Processing (NLP), is Named Entity\nRecognition (NER). It is used in many applications and also can be used as an\nintermediate step for other tasks. We present ANER, a web-based named entity\nrecognizer for the Arabic, and Arabizi languages. The model is built upon BERT,\nwhich is a transformer-based encoder. It can recognize 50 different entity\nclasses, covering various fields. We trained our model on the WikiFANE\\_Gold\ndataset which consists of Wikipedia articles. We achieved an F1 score of\n88.7\\%, which beats CAMeL Tools' F1 score of 83\\% on the ANERcorp dataset,\nwhich has only 4 classes. We also got an F1 score of 77.7\\% on the\nNewsFANE\\_Gold dataset which contains out-of-domain data from News articles.\nThe system is deployed on a user-friendly web interface that accepts users'\ninputs in Arabic, or Arabizi. It allows users to explore the entities in the\ntext by highlighting them. It can also direct users to get information about\nentities through Wikipedia directly. We added the ability to do NER using our\nmodel, or CAMeL Tools' model through our website. ANER is publicly accessible\nat \\url{http://www.aner.online}. We also deployed our model on HuggingFace at\nhttps://huggingface.co/boda/ANER, to allow developers to test and use it.\n","authors":["Abdelrahman \"Boda\" Sadallah","Omar Ahmed","Shimaa Mohamed","Omar Hatem","Doaa Hesham","Ahmed H. Yousef"],"pdf_url":"https://arxiv.org/pdf/2308.14669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14654v1","updated":"2023-08-28T15:36:33Z","published":"2023-08-28T15:36:33Z","title":"Joint Multiple Intent Detection and Slot Filling with Supervised\n Contrastive Learning and Self-Distillation","summary":" Multiple intent detection and slot filling are two fundamental and crucial\ntasks in spoken language understanding. Motivated by the fact that the two\ntasks are closely related, joint models that can detect intents and extract\nslots simultaneously are preferred to individual models that perform each task\nindependently. The accuracy of a joint model depends heavily on the ability of\nthe model to transfer information between the two tasks so that the result of\none task can correct the result of the other. In addition, since a joint model\nhas multiple outputs, how to train the model effectively is also challenging.\nIn this paper, we present a method for multiple intent detection and slot\nfilling by addressing these challenges. 
First, we propose a bidirectional joint\nmodel that explicitly employs intent information to recognize slots and slot\nfeatures to detect intents. Second, we introduce a novel method for training\nthe proposed joint model using supervised contrastive learning and\nself-distillation. Experimental results on two benchmark datasets MixATIS and\nMixSNIPS show that our method outperforms state-of-the-art models in both\ntasks. The results also demonstrate the contributions of both bidirectional\ndesign and the training method to the accuracy improvement. Our source code is\navailable at https://github.com/anhtunguyen98/BiSLU\n","authors":["Nguyen Anh Tu","Hoang Thi Thu Uyen","Tu Minh Phuong","Ngo Xuan Bach"],"pdf_url":"https://arxiv.org/pdf/2308.14654v1.pdf","comment":"Accepted at ECAI 2023"},{"id":"http://arxiv.org/abs/2306.11167v2","updated":"2023-08-28T15:34:27Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if not none, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impede human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v2.pdf","comment":"V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3\n (added). 
22 pages with Appendix"},{"id":"http://arxiv.org/abs/2308.14641v1","updated":"2023-08-28T15:12:34Z","published":"2023-08-28T15:12:34Z","title":"Challenges of GPT-3-based Conversational Agents for Healthcare","summary":" The potential to provide patients with faster information access while\nallowing medical specialists to concentrate on critical tasks makes medical\ndomain dialog agents appealing. However, the integration of large-language\nmodels (LLMs) into these agents presents certain limitations that may result in\nserious consequences. This paper investigates the challenges and risks of using\nGPT-3-based models for medical question-answering (MedQA). We perform several\nevaluations contextualized in terms of standard medical principles. We provide\na procedure for manually designing patient queries to stress-test high-risk\nlimitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to\nrespond adequately to these queries, generating erroneous medical information,\nunsafe recommendations, and content that may be considered offensive.\n","authors":["Fabian Lechner","Allison Lahnala","Charles Welch","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.14641v1.pdf","comment":"12 pages, 9 Tables, accepted to RANLP 2023"},{"id":"http://arxiv.org/abs/2308.14634v1","updated":"2023-08-28T15:04:16Z","published":"2023-08-28T15:04:16Z","title":"Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance","summary":" We propose the use of conversational GPT models for easy and quick few-shot\ntext classification in the financial domain using the Banking77 dataset. Our\napproach involves in-context learning with GPT-3.5 and GPT-4, which minimizes\nthe technical expertise required and eliminates the need for expensive GPU\ncomputing while yielding quick and accurate results. Additionally, we fine-tune\nother pre-trained, masked language models with SetFit, a recent contrastive\nlearning technique, to achieve state-of-the-art results both in full-data and\nfew-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can\noutperform fine-tuned, non-generative models even with fewer examples. However,\nsubscription fees associated with these solutions may be considered costly for\nsmall organizations. Lastly, we find that generative models perform better on\nthe given task when shown representative samples selected by a human expert\nrather than when shown random ones. We conclude that a) our proposed methods\noffer a practical solution for few-shot tasks in datasets with limited label\navailability, and b) our state-of-the-art results can inspire future work in\nthe area.\n","authors":["Lefteris Loukas","Ilias Stogiannidis","Prodromos Malakasiotis","Stavros Vassos"],"pdf_url":"https://arxiv.org/pdf/2308.14634v1.pdf","comment":"Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023"},{"id":"http://arxiv.org/abs/2207.01964v3","updated":"2023-08-28T15:00:24Z","published":"2022-07-05T11:21:09Z","title":"Quantum Circuit Compiler for a Shuttling-Based Trapped-Ion Quantum\n Computer","summary":" The increasing capabilities of quantum computing hardware and the challenge\nof realizing deep quantum circuits require fully automated and efficient tools\nfor compiling quantum circuits. To express arbitrary circuits in a sequence of\nnative gates specific to the quantum computer architecture, it is necessary to\nmake algorithms portable across the landscape of quantum hardware providers.
In\nthis work, we present a compiler capable of transforming and optimizing a\nquantum circuit targeting a shuttling-based trapped-ion quantum processor. It\nconsists of custom algorithms set on top of the quantum circuit framework\nPytket. The performance was evaluated for a wide range of quantum circuits and\nthe results show that the gate counts can be reduced by factors up to 5.1\ncompared to standard Pytket and up to 2.2 compared to standard Qiskit\ncompilation.\n","authors":["Fabian Kreppel","Christian Melzer","Diego Olvera Millán","Janis Wagner","Janine Hilder","Ulrich Poschinger","Ferdinand Schmidt-Kaler","André Brinkmann"],"pdf_url":"https://arxiv.org/pdf/2207.01964v3.pdf","comment":"35 pages, 25 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.14608v1","updated":"2023-08-28T14:23:04Z","published":"2023-08-28T14:23:04Z","title":"AI in the Gray: Exploring Moderation Policies in Dialogic Large Language\n Models vs. Human Answers in Controversial Topics","summary":" The introduction of ChatGPT and the subsequent improvement of Large Language\nModels (LLMs) have prompted more and more individuals to turn to the use of\nChatBots, both for information and assistance with decision-making. However,\nthe information the user is after is often not formulated by these ChatBots\nobjectively enough to be provided with a definite, globally accepted answer.\n Controversial topics, such as \"religion\", \"gender identity\", \"freedom of\nspeech\", and \"equality\", among others, can be a source of conflict as partisan\nor biased answers can reinforce preconceived notions or promote disinformation.\nBy exposing ChatGPT to such debatable questions, we aim to understand its level\nof awareness and if existing models are subject to socio-political and/or\neconomic biases. We also aim to explore how AI-generated answers compare to\nhuman ones. For exploring this, we use a dataset of a social media platform\ncreated for the purpose of debating human-generated claims on polemic subjects\namong users, dubbed Kialo.\n Our results show that while previous versions of ChatGPT have had important\nissues with controversial topics, more recent versions of ChatGPT\n(gpt-3.5-turbo) are no longer manifesting significant explicit biases in\nseveral knowledge areas. In particular, it is well-moderated regarding economic\naspects. However, it still maintains degrees of implicit libertarian leaning\ntoward right-winged ideals which suggest the need for increased moderation from\nthe socio-political point of view. In terms of domain knowledge on\ncontroversial topics, with the exception of the \"Philosophical\" category,\nChatGPT is performing well in keeping up with the collective human level of\nknowledge. Finally, we see that sources of Bing AI have slightly more tendency\nto the center when compared to human answers. All the analyses we make are\ngeneralizable to other types of biases and domains.\n","authors":["Vahid Ghafouri","Vibhor Agarwal","Yong Zhang","Nishanth Sastry","Jose Such","Guillermo Suarez-Tangil"],"pdf_url":"https://arxiv.org/pdf/2308.14608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12890v2","updated":"2023-08-28T14:16:42Z","published":"2023-08-24T16:09:13Z","title":"Large Language Models Vote: Prompting for Rare Disease Identification","summary":" The emergence of generative Large Language Models (LLMs) emphasizes the need\nfor accurate and efficient prompting approaches. 
LLMs are often applied in\nFew-Shot Learning (FSL) contexts, where tasks are executed with minimal\ntraining data. FSL has become popular in many Artificial Intelligence (AI)\nsubdomains, including AI for health. Rare diseases affect a small fraction of\nthe population. Rare disease identification from clinical notes inherently\nrequires FSL techniques due to limited data availability. Manual data\ncollection and annotation is both expensive and time-consuming. In this paper,\nwe propose Models-Vote Prompting (MVP), a flexible prompting approach for\nimproving the performance of LLM queries in FSL settings. MVP works by\nprompting numerous LLMs to perform the same tasks and then conducting a\nmajority vote on the resulting outputs. This method achieves improved results\nto any one model in the ensemble on one-shot rare disease identification and\nclassification tasks. We also release a novel rare disease dataset for FSL,\navailable to those who signed the MIMIC-IV Data Use Agreement (DUA).\nFurthermore, in using MVP, each model is prompted multiple times, substantially\nincreasing the time needed for manual annotation, and to address this, we\nassess the feasibility of using JSON for automating generative LLM evaluation.\n","authors":["David Oniani","Jordan Hilsman","Hang Dong","Fengyi Gao","Shiven Verma","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11891v2","updated":"2023-08-28T14:07:12Z","published":"2023-08-23T03:38:21Z","title":"Bridging the Gap: Deciphering Tabular Data Using Large Language Model","summary":" In the realm of natural language processing, the understanding of tabular\ndata has perpetually stood as a focal point of scholarly inquiry. The emergence\nof expansive language models, exemplified by the likes of ChatGPT, has ushered\nin a wave of endeavors wherein researchers aim to harness these models for\ntasks related to table-based question answering. Central to our investigative\npursuits is the elucidation of methodologies that amplify the aptitude of such\nlarge language models in discerning both the structural intricacies and\ninherent content of tables, ultimately facilitating their capacity to provide\ninformed responses to pertinent queries. To this end, we have architected a\ndistinctive module dedicated to the serialization of tables for seamless\nintegration with expansive language models. Additionally, we've instituted a\ncorrective mechanism within the model to rectify potential inaccuracies.\nExperimental results indicate that, although our proposed method trails the\nSOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about\n1.2% in tests on specific datasets. This research marks the first application\nof large language models to table-based question answering tasks, enhancing the\nmodel's comprehension of both table structures and content.\n","authors":["Hengyuan Zhang","Peng Chang","Zongcheng Ji"],"pdf_url":"https://arxiv.org/pdf/2308.11891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14536v1","updated":"2023-08-28T12:47:41Z","published":"2023-08-28T12:47:41Z","title":"Spoken Language Intelligence of Large Language Models for Language\n Learning","summary":" People have long hoped for a conversational system that can assist in\nreal-life situations, and recent progress on large language models (LLMs) is\nbringing this idea closer to reality. 
While LLMs are often impressive in\nperformance, their efficacy in real-world scenarios that demand expert\nknowledge remains unclear. LLMs are believed to hold the most potential and\nvalue in education, especially in the development of Artificial intelligence\n(AI) based virtual teachers capable of facilitating language learning. Our\nfocus is centered on evaluating the efficacy of LLMs in the realm of education,\nspecifically in the areas of spoken language learning which encompass\nphonetics, phonology, and second language acquisition. We introduce a new\nmultiple-choice question dataset to evaluate the effectiveness of LLMs in the\naforementioned scenarios, including understanding and application of spoken\nlanguage knowledge. In addition, we investigate the influence of various\nprompting techniques such as zero- and few-shot methods (prepending the question\nwith question-answer exemplars), chain-of-thought (CoT, think step-by-step),\nin-domain exemplars and external tools (Google, Wikipedia). We conducted a\nlarge-scale evaluation on popular LLMs (20 distinct models) using these\nmethods. We achieved significant performance improvements compared to the\nzero-shot baseline in the practical questions reasoning (GPT-3.5, 49.1% ->\n63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of different\nsizes have a good understanding of concepts in phonetics, phonology, and second\nlanguage acquisition, but show limitations in reasoning for real-world\nproblems. Additionally, we explore preliminary findings on conversational\ncommunication.\n","authors":["Linkai Peng","Baorian Nuchged","Yingming Gao"],"pdf_url":"https://arxiv.org/pdf/2308.14536v1.pdf","comment":"28 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2308.14533v1","updated":"2023-08-28T12:46:21Z","published":"2023-08-28T12:46:21Z","title":"A Multi-Task Semantic Decomposition Framework with Task-specific\n Pre-training for Few-Shot NER","summary":" The objective of few-shot named entity recognition is to identify named\nentities with limited labeled instances. Previous works have primarily focused\non optimizing the traditional token-wise classification framework, while\nneglecting the exploration of information based on NER data characteristics. To\naddress this issue, we propose a Multi-Task Semantic Decomposition Framework\nvia Joint Task-specific Pre-training (MSDP) for few-shot NER. Drawing\ninspiration from demonstration-based and contrastive learning, we introduce two\nnovel pre-training tasks: Demonstration-based Masked Language Modeling (MLM)\nand Class Contrastive Discrimination. These tasks effectively incorporate\nentity boundary information and enhance entity representation in Pre-trained\nLanguage Models (PLMs). In the downstream main task, we introduce a multi-task\njoint optimization framework with the semantic decomposing method, which\nfacilitates the model to integrate two different semantic information for\nentity classification. Experimental results of two few-shot NER benchmarks\ndemonstrate that MSDP consistently outperforms strong baselines by a large\nmargin.
Extensive analyses validate the effectiveness and generalization of\nMSDP.\n","authors":["Guanting Dong","Zechen Wang","Jinxu Zhao","Gang Zhao","Daichi Guo","Dayuan Fu","Tingfeng Hui","Chen Zeng","Keqing He","Xuefeng Li","Liwen Wang","Xinyue Cui","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14533v1.pdf","comment":"Accepted by CIKM 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2308.14508v1","updated":"2023-08-28T11:53:40Z","published":"2023-08-28T11:53:40Z","title":"LongBench: A Bilingual, Multitask Benchmark for Long Context\n Understanding","summary":" Although large language models (LLMs) demonstrate impressive performance for\nmany language tasks, most of them can only handle texts a few thousand tokens\nlong, limiting their applications on longer sequence inputs, such as books,\nreports, and codebases. Recent works have proposed methods to improve LLMs'\nlong context capabilities by extending context windows and more sophisticated\nmemory mechanisms. However, comprehensive benchmarks tailored for evaluating\nlong context understanding are lacking. In this paper, we introduce LongBench,\nthe first bilingual, multi-task benchmark for long context understanding,\nenabling a more rigorous evaluation of long context understanding. LongBench\ncomprises 21 datasets across 6 task categories in both English and Chinese,\nwith an average length of 6,711 words (English) and 13,386 characters\n(Chinese). These tasks cover key long-text application areas including\nsingle-doc QA, multi-doc QA, summarization, few-shot learning, synthetic tasks,\nand code completion. All datasets in LongBench are standardized into a unified\nformat, allowing for effortless automatic evaluation of LLMs. Upon\ncomprehensive evaluation of 8 LLMs on LongBench, we find that: (1) Commercial\nmodel (GPT-3.5-Turbo-16k) outperforms other open-sourced models, but still\nstruggles on longer contexts. (2) Scaled position embedding and fine-tuning on\nlonger sequences lead to substantial improvement on long context understanding.\n(3) Context compression technique such as retrieval brings improvement for\nmodel with weak ability on long contexts, but the performance still lags behind\nmodels that have strong long context understanding capability. The code and\ndatasets are available at https://github.com/THUDM/LongBench.\n","authors":["Yushi Bai","Xin Lv","Jiajie Zhang","Hongchang Lyu","Jiankai Tang","Zhidian Huang","Zhengxiao Du","Xiao Liu","Aohan Zeng","Lei Hou","Yuxiao Dong","Jie Tang","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2308.14508v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2206.08955v4","updated":"2023-08-28T11:19:57Z","published":"2022-06-17T18:11:34Z","title":"Making first order linear logic a generating grammar","summary":" It is known that different categorial grammars have surface representation in\na fragment of first order multiplicative linear logic (MLL1). We show that the\nfragment of interest is equivalent to the recently introduced extended tensor\ntype calculus (ETTC). ETTC is a calculus of specific typed terms, which\nrepresent tuples of strings, more precisely bipartite graphs decorated with\nstrings. Types are derived from linear logic formulas, and rules correspond to\nconcrete operations on these string-labeled graphs, so that they can be\nconveniently visualized. 
This provides the above mentioned fragment of MLL1\nthat is relevant for language modeling not only with some alternative syntax\nand intuitive geometric representation, but also with an intrinsic deductive\nsystem, which has been absent.\n In this work we consider a non-trivial notationally enriched variation of the\npreviously introduced {\\bf ETTC}, which allows more concise and transparent\ncomputations. We present both a cut-free sequent calculus and a natural\ndeduction formalism.\n","authors":["Sergey Slavnov"],"pdf_url":"https://arxiv.org/pdf/2206.08955v4.pdf","comment":"Revised and extended version with detailed proofs. arXiv admin note:\n substantial text overlap with arXiv:2112.15253"},{"id":"http://arxiv.org/abs/2305.07358v2","updated":"2023-08-28T11:07:56Z","published":"2023-05-12T10:08:46Z","title":"Towards Versatile and Efficient Visual Knowledge Integration into\n Pre-trained Language Models with Cross-Modal Adapters","summary":" Humans learn language via multi-modal knowledge. However, due to the\ntext-only pre-training scheme, most existing pre-trained language models (PLMs)\nare hindered from the multi-modal information.\n To inject visual knowledge into PLMs, existing methods incorporate either the\ntext or image encoder of vision-language models (VLMs) to encode the visual\ninformation and update all the original parameters of PLMs for knowledge\nfusion.\n In this paper, we propose a new plug-and-play module, X-adapter, to flexibly\nleverage the aligned visual and textual knowledge learned in pre-trained VLMs\nand efficiently inject them into PLMs.\n Specifically, we insert X-adapters into PLMs, and only the added parameters\nare updated during adaptation.\n To fully exploit the potential in VLMs, X-adapters consist of two\nsub-modules, V-expert and T-expert, to fuse VLMs' image and text\nrepresentations, respectively.\n We can opt for activating different sub-modules depending on the downstream\ntasks.\n Experimental results show that our method can significantly improve the\nperformance on object-color reasoning and natural language understanding (NLU)\ntasks compared with PLM baselines.\n","authors":["Xinyun Zhang","Haochen Tan","Han Wu","Mingjie Zhan","Ding Liang","Bei Yu"],"pdf_url":"https://arxiv.org/pdf/2305.07358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14484v1","updated":"2023-08-28T10:51:11Z","published":"2023-08-28T10:51:11Z","title":"Multimodal Detection of Social Spambots in Twitter using Transformers","summary":" Although not all bots are malicious, the vast majority of them are\nresponsible for spreading misinformation and manipulating the public opinion\nabout several issues, i.e., elections and many more. Therefore, the early\ndetection of social spambots is crucial. Although there have been proposed\nmethods for detecting bots in social media, there are still substantial\nlimitations. For instance, existing research initiatives still extract a large\nnumber of features and train traditional machine learning algorithms or use\nGloVe embeddings and train LSTMs. However, feature extraction is a tedious\nprocedure demanding domain expertise. Also, language models based on\ntransformers have been proved to be better than LSTMs. Other approaches create\nlarge graphs and train graph neural networks requiring in this way many hours\nfor training and access to computational resources. 
To tackle these\nlimitations, this is the first study employing only the user description field\nand images of three channels denoting the type and content of tweets posted by\nthe users. Firstly, we create digital DNA sequences, transform them to 3d\nimages, and apply pretrained models of the vision domain, including\nEfficientNet, AlexNet, VGG16, etc. Next, we propose a multimodal approach,\nwhere we use TwHIN-BERT for getting the textual representation of the user\ndescription field and employ VGG16 for acquiring the visual representation for\nthe image modality. We propose three different fusion methods, namely\nconcatenation, gated multimodal unit, and crossmodal attention, for fusing the\ndifferent modalities and compare their performances. Extensive experiments\nconducted on the Cresci '17 dataset demonstrate valuable advantages of our\nintroduced approaches over state-of-the-art ones reaching Accuracy up to\n99.98%.\n","authors":["Loukas Ilias","Ioannis Michail Kazelidis","Dimitris Askounis"],"pdf_url":"https://arxiv.org/pdf/2308.14484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14482v1","updated":"2023-08-28T10:44:18Z","published":"2023-08-28T10:44:18Z","title":"An Empirical Study of Consistency Regularization for End-to-End\n Speech-to-Text Translation","summary":" Consistency regularization methods, such as R-Drop (Liang et al., 2021) and\nCrossConST (Gao et al., 2023), have achieved impressive supervised and\nzero-shot performance in the neural machine translation (NMT) field. Can we\nalso boost end-to-end (E2E) speech-to-text translation (ST) by leveraging\nconsistency regularization? In this paper, we conduct empirical studies on\nintra-modal and cross-modal consistency and propose two training strategies,\nSimRegCR and SimZeroCR, for E2E ST in regular and zero-shot scenarios.\nExperiments on the MuST-C benchmark show that our approaches achieve\nstate-of-the-art (SOTA) performance in most translation directions. The\nanalyses prove that regularization brought by the intra-modal consistency,\ninstead of modality gap, is crucial for the regular E2E ST, and the cross-modal\nconsistency could close the modality gap and boost the zero-shot E2E ST\nperformance.\n","authors":["Pengzhi Gao","Ruiqing Zhang","Zhongjun He","Hua Wu","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.05090v3","updated":"2023-08-28T10:22:14Z","published":"2021-09-10T20:06:27Z","title":"Enhancing Self-Disclosure In Neural Dialog Models By Candidate\n Re-ranking","summary":" Neural language modelling has progressed the state-of-the-art in different\ndownstream Natural Language Processing (NLP) tasks. One such area is of\nopen-domain dialog modelling, neural dialog models based on GPT-2 such as\nDialoGPT have shown promising performance in single-turn conversation. However,\nsuch (neural) dialog models have been criticized for generating responses which\nalthough may have relevance to the previous human response, tend to quickly\ndissipate human interest and descend into trivial conversation. One reason for\nsuch performance is the lack of explicit conversation strategy being employed\nin human-machine conversation. Humans employ a range of conversation strategies\nwhile engaging in a conversation, one such key social strategies is\nSelf-disclosure(SD). A phenomenon of revealing information about one-self to\nothers. 
Social penetration theory (SPT) proposes that communication between two\npeople moves from shallow to deeper levels as the relationship progresses\nprimarily through self-disclosure. Disclosure helps in creating rapport among\nthe participants engaged in a conversation. In this paper, Self-disclosure\nenhancement architecture (SDEA) is introduced utilizing Self-disclosure Topic\nModel (SDTM) during the inference stage of a neural dialog model to re-rank\nresponse candidates to enhance self-disclosure in single-turn responses from\nthe model.\n","authors":["Mayank Soni","Benjamin Cowan","Vincent Wade"],"pdf_url":"https://arxiv.org/pdf/2109.05090v3.pdf","comment":"10 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.12086v2","updated":"2023-08-28T09:42:59Z","published":"2023-08-23T12:11:27Z","title":"Out of the Cage: How Stochastic Parrots Win in Cyber Security\n Environments","summary":" Large Language Models (LLMs) have gained widespread popularity across diverse\ndomains involving text generation, summarization, and various natural language\nprocessing tasks. Despite their inherent limitations, LLM-based designs have\nshown promising capabilities in planning and navigating open-world scenarios.\nThis paper introduces a novel application of pre-trained LLMs as agents within\ncybersecurity network environments, focusing on their utility for sequential\ndecision-making processes.\n We present an approach wherein pre-trained LLMs are leveraged as attacking\nagents in two reinforcement learning environments. Our proposed agents\ndemonstrate similar or better performance against state-of-the-art agents\ntrained for thousands of episodes in most scenarios and configurations. In\naddition, the best LLM agents perform similarly to human testers of the\nenvironment without any additional training process. This design highlights the\npotential of LLMs to efficiently address complex decision-making tasks within\ncybersecurity.\n Furthermore, we introduce a new network security environment named\nNetSecGame. The environment is designed to eventually support complex\nmulti-agent scenarios within the network security domain. The proposed\nenvironment mimics real network attacks and is designed to be highly modular\nand adaptable for various scenarios.\n","authors":["Maria Rigaki","Ondřej Lukáš","Carlos A. Catania","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.12086v2.pdf","comment":"Under review. 10 pages plus appendices, 7 figures, 4 tables. Edit:\n fix e-mails and code repository"},{"id":"http://arxiv.org/abs/2303.17650v3","updated":"2023-08-28T09:34:59Z","published":"2023-03-30T18:28:33Z","title":"Comparing Abstractive Summaries Generated by ChatGPT to Real Summaries\n Through Blinded Reviewers and Text Classification Algorithms","summary":" Large Language Models (LLMs) have gathered significant attention due to their\nimpressive performance on a variety of tasks. ChatGPT, developed by OpenAI, is\na recent addition to the family of language models and is being called a\ndisruptive technology by a few, owing to its human-like text-generation\ncapabilities. Although many anecdotal examples across the internet have\nevaluated ChatGPT's strengths and weaknesses, only a few systematic research\nstudies exist. To contribute to the body of literature of systematic research\non ChatGPT, we evaluate the performance of ChatGPT on Abstractive Summarization\nby the means of automated metrics and blinded human reviewers.
We also build\nautomatic text classifiers to detect ChatGPT-generated summaries. We found that\nwhile text classification algorithms can distinguish between real and generated\nsummaries, humans are unable to distinguish between real summaries and those\nproduced by ChatGPT.\n","authors":["Mayank Soni","Vincent Wade"],"pdf_url":"https://arxiv.org/pdf/2303.17650v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18004v2","updated":"2023-08-28T09:34:39Z","published":"2023-05-29T10:33:08Z","title":"The Effects of Political Martyrdom on Election Results: The\n Assassination of Abe","summary":" In developed nations, assassinations are rare and thus the impact of such acts\non the electoral and political landscape is understudied. In this paper, we\nfocus on Twitter data to examine the effects of Japan's former Prime Minister\nAbe's assassination on the Japanese House of Councillors elections in 2022. We\nutilize sentiment analysis and emotion detection together with topic modeling\non over 2 million tweets and compare them against tweets during previous\nelection cycles. Our findings indicate that Twitter sentiments were negatively\nimpacted by the event in the short term and that social media attention span\nhas shortened. We also discuss how \"necropolitics\" affected the outcome of the\nelections in favor of the deceased's party, meaning that there seems to have\nbeen an effect of Abe's death on the election outcome, though the findings\nwarrant further investigation for conclusive results.\n","authors":["Miu Nicole Takagi"],"pdf_url":"https://arxiv.org/pdf/2305.18004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14436v1","updated":"2023-08-28T09:22:02Z","published":"2023-08-28T09:22:02Z","title":"Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware\n Pre-training for KBQA","summary":" Knowledge Base Question Answering (KBQA) aims to answer natural language\nquestions with factual information such as entities and relations in KBs.\nHowever, traditional Pre-trained Language Models (PLMs) are directly\npre-trained on large-scale natural language corpus, which poses challenges for\nthem in understanding and representing complex subgraphs in structured KBs. To\nbridge the gap between texts and structured KBs, we propose a Structured\nKnowledge-aware Pre-training method (SKP). In the pre-training stage, we\nintroduce two novel structured knowledge-aware tasks, guiding the model to\neffectively learn the implicit relationship and better representations of\ncomplex subgraphs. In the downstream KBQA task, we further design an efficient\nlinearization strategy and an interval attention mechanism, which assist the\nmodel to better encode complex subgraphs and shield the interference of\nirrelevant subgraphs during reasoning respectively. Detailed experiments and\nanalyses on WebQSP verify the effectiveness of SKP, especially the significant\nimprovement in subgraph retrieval (+4.08% H@10).\n","authors":["Guanting Dong","Rumei Li","Sirui Wang","Yupeng Zhang","Yunsen Xian","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14436v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.14429v1","updated":"2023-08-28T09:06:28Z","published":"2023-08-28T09:06:28Z","title":"Biomedical Entity Linking with Triple-aware Pre-Training","summary":" Linking biomedical entities is an essential aspect in biomedical natural\nlanguage processing tasks, such as text mining and question answering.
However,\na difficulty of linking the biomedical entities using current large language\nmodels (LLM) trained on a general corpus is that biomedical entities are\nscarcely distributed in texts and therefore have been rarely seen during\ntraining by the LLM. At the same time, those LLMs are not aware of high level\nsemantic connection between different biomedical entities, which are useful in\nidentifying similar concepts in different textual contexts. To cope with\naforementioned problems, some recent works focused on injecting knowledge graph\ninformation into LLMs. However, former methods either ignore the relational\nknowledge of the entities or lead to catastrophic forgetting. Therefore, we\npropose a novel framework to pre-train the powerful generative LLM by a corpus\nsynthesized from a KG. In the evaluations we are unable to confirm the benefit\nof including synonym, description or relational information.\n","authors":["Xi Yan","Cedric Möller","Ricardo Usbeck"],"pdf_url":"https://arxiv.org/pdf/2308.14429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14423v1","updated":"2023-08-28T09:04:03Z","published":"2023-08-28T09:04:03Z","title":"GADePo: Graph-Assisted Declarative Pooling Transformers for\n Document-Level Relation Extraction","summary":" Document-level relation extraction aims to identify relationships between\nentities within a document. Current methods rely on text-based encoders and\nemploy various hand-coded pooling heuristics to aggregate information from\nentity mentions and associated contexts. In this paper, we replace these rigid\npooling functions with explicit graph relations by leveraging the intrinsic\ngraph processing capabilities of the Transformer model. We propose a joint\ntext-graph Transformer model, and a graph-assisted declarative pooling (GADePo)\nspecification of the input which provides explicit and high-level instructions\nfor information aggregation. This allows the pooling process to be guided by\ndomain-specific knowledge or desired outcomes but still learned by the\nTransformer, leading to more flexible and customizable pooling strategies. We\nextensively evaluate our method across diverse datasets and models, and show\nthat our approach yields promising results that are comparable to those\nachieved by the hand-coded pooling functions.\n","authors":["Andrei C. Coman","Christos Theodoropoulos","Marie-Francine Moens","James Henderson"],"pdf_url":"https://arxiv.org/pdf/2308.14423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11117v3","updated":"2023-08-28T08:44:09Z","published":"2023-03-20T13:58:35Z","title":"EmotionIC: Emotional Inertia and Contagion-Driven Dependency Modeling\n for Emotion Recognition in Conversation","summary":" Emotion Recognition in Conversation (ERC) has attracted growing attention in\nrecent years as a result of the advancement and implementation of\nhuman-computer interface technologies. In this paper, we propose a novel\napproach to dependency modeling driven by Emotional Inertia and Contagion\n(EmotionIC) for ERC task. Our EmotionIC consists of three main components,\ni.e., Identity Masked Multi-Head Attention (IMMHA), Dialogue-based Gated\nRecurrent Unit (DiaGRU), and Skip-chain Conditional Random Field (SkipCRF).\nCompared to previous ERC models, EmotionIC can model a conversation more\nthoroughly at both the feature-extraction and classification levels. The\nproposed model attempts to integrate the advantages of attention- and\nrecurrence-based methods at the feature-extraction level. 
Specifically, IMMHA\nis applied to capture identity-based global contextual dependencies, while\nDiaGRU is utilized to extract speaker- and temporal-aware local contextual\ninformation. At the classification level, SkipCRF can explicitly mine complex\nemotional flows from higher-order neighboring utterances in the conversation.\nExperimental results show that our method can significantly outperform the\nstate-of-the-art models on four benchmark datasets. The ablation studies\nconfirm that our modules can effectively model emotional inertia and contagion.\n","authors":["Yingjian Liu","Jiang Li","Xiaoping Wang","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.11117v3.pdf","comment":"19 pages,10 figures"},{"id":"http://arxiv.org/abs/2307.07924v3","updated":"2023-08-28T08:38:38Z","published":"2023-07-16T02:11:34Z","title":"Communicative Agents for Software Development","summary":" Software engineering is a domain characterized by intricate decision-making\nprocesses, often relying on nuanced intuition and consultation. Recent\nadvancements in deep learning have started to revolutionize software\nengineering practices through elaborate designs implemented at various stages\nof software development. In this paper, we present an innovative paradigm that\nleverages large language models (LLMs) throughout the entire software\ndevelopment process, streamlining and unifying key processes through natural\nlanguage communication, thereby eliminating the need for specialized models at\neach phase. At the core of this paradigm lies ChatDev, a virtual chat-powered\nsoftware development company that mirrors the established waterfall model,\nmeticulously dividing the development process into four distinct chronological\nstages: designing, coding, testing, and documenting. Each stage engages a team\nof agents, such as programmers, code reviewers, and test engineers, fostering\ncollaborative dialogue and facilitating a seamless workflow. The chat chain\nacts as a facilitator, breaking down each stage into atomic subtasks. This\nenables dual roles, allowing for proposing and validating solutions through\ncontext-aware communication, leading to efficient resolution of specific\nsubtasks. The instrumental analysis of ChatDev highlights its remarkable\nefficacy in software generation, enabling the completion of the entire software\ndevelopment process in under seven minutes at a cost of less than one dollar.\nIt not only identifies and alleviates potential vulnerabilities but also\nrectifies potential hallucinations while maintaining commendable efficiency and\ncost-effectiveness. The potential of ChatDev unveils fresh possibilities for\nintegrating LLMs into the realm of software development.\n","authors":["Chen Qian","Xin Cong","Wei Liu","Cheng Yang","Weize Chen","Yusheng Su","Yufan Dang","Jiahao Li","Juyuan Xu","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2307.07924v3.pdf","comment":"https://github.com/OpenBMB/ChatDev"},{"id":"http://arxiv.org/abs/2307.08487v3","updated":"2023-08-28T08:35:28Z","published":"2023-07-17T13:49:52Z","title":"Latent Jailbreak: A Benchmark for Evaluating Text Safety and Output\n Robustness of Large Language Models","summary":" Considerable research efforts have been devoted to ensuring that large\nlanguage models (LLMs) align with human values and generate safe text. 
However,\nan excessive focus on sensitivity to certain topics can compromise the model's\nrobustness in following instructions, thereby impacting its overall performance\nin completing tasks. Previous benchmarks for jailbreaking LLMs have primarily\nfocused on evaluating the safety of the models without considering their\nrobustness. In this paper, we propose a benchmark that assesses both the safety\nand robustness of LLMs, emphasizing the need for a balanced approach. To\ncomprehensively study text safety and output robustness, we introduce a latent\njailbreak prompt dataset, each involving malicious instruction embedding.\nSpecifically, we instruct the model to complete a regular task, such as\ntranslation, with the text to be translated containing malicious instructions.\nTo further analyze safety and robustness, we design a hierarchical annotation\nframework. We present a systematic analysis of the safety and robustness of\nLLMs regarding the position of explicit normal instructions, word replacements\n(verbs in explicit normal instructions, target groups in malicious\ninstructions, cue words for explicit normal instructions), and instruction\nreplacements (different explicit normal instructions). Our results demonstrate\nthat current LLMs not only prioritize certain instruction verbs but also\nexhibit varying jailbreak rates for different instruction verbs in explicit\nnormal instructions. Code and data are available at\nhttps://github.com/qiuhuachuan/latent-jailbreak.\n","authors":["Huachuan Qiu","Shuai Zhang","Anqi Li","Hongliang He","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2307.08487v3.pdf","comment":"Code and data are available at\n https://github.com/qiuhuachuan/latent-jailbreak"},{"id":"http://arxiv.org/abs/2308.14391v1","updated":"2023-08-28T08:14:20Z","published":"2023-08-28T08:14:20Z","title":"FIRE: Food Image to REcipe generation","summary":" Food computing has emerged as a prominent multidisciplinary field of research\nin recent years. An ambitious goal of food computing is to develop end-to-end\nintelligent systems capable of autonomously producing recipe information for a\nfood image. Current image-to-recipe methods are retrieval-based and their\nsuccess depends heavily on the dataset size and diversity, as well as the\nquality of learned embeddings. Meanwhile, the emergence of powerful\nattention-based vision and language models presents a promising avenue for\naccurate and generalizable recipe generation, which has yet to be extensively\nexplored. This paper proposes FIRE, a novel multimodal methodology tailored to\nrecipe generation in the food computing domain, which generates the food title,\ningredients, and cooking instructions based on input food images. FIRE\nleverages the BLIP model to generate titles, utilizes a Vision Transformer with\na decoder for ingredient extraction, and employs the T5 model to generate\nrecipes incorporating titles and ingredients as inputs. We showcase two\npractical applications that can benefit from integrating FIRE with large\nlanguage model prompting: recipe customization to fit recipes to user\npreferences and recipe-to-code transformation to enable automated cooking\nprocesses. 
Our experimental findings validate the efficacy of our proposed\napproach, underscoring its potential for future advancements and widespread\nadoption in food computing.\n","authors":["Prateek Chhikara","Dhiraj Chaurasia","Yifan Jiang","Omkar Masur","Filip Ilievski"],"pdf_url":"https://arxiv.org/pdf/2308.14391v1.pdf","comment":"5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.03104v5","updated":"2023-08-28T08:11:12Z","published":"2023-07-06T16:26:34Z","title":"Efficient Domain Adaptation of Sentence Embeddings Using Adapters","summary":" Sentence embeddings enable us to capture the semantic similarity of short\ntexts. Most sentence embedding models are trained for general semantic textual\nsimilarity tasks. Therefore, to use sentence embeddings in a particular domain,\nthe model must be adapted to it in order to achieve good results. Usually, this\nis done by fine-tuning the entire sentence embedding model for the domain of\ninterest. While this approach yields state-of-the-art results, all of the\nmodel's weights are updated during fine-tuning, making this method\nresource-intensive. Therefore, instead of fine-tuning entire sentence embedding\nmodels for each target domain individually, we propose to train lightweight\nadapters. These domain-specific adapters do not require fine-tuning all\nunderlying sentence embedding model parameters. Instead, we only train a small\nnumber of additional parameters while keeping the weights of the underlying\nsentence embedding model fixed. Training domain-specific adapters allows always\nusing the same base model and only exchanging the domain-specific adapters to\nadapt sentence embeddings to a specific domain. We show that using adapters for\nparameter-efficient domain adaptation of sentence embeddings yields competitive\nperformance within 1% of a domain-adapted, entirely fine-tuned sentence\nembedding model while only training approximately 3.6% of the parameters.\n","authors":["Tim Schopf","Dennis N. Schneider","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.03104v5.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2302.04391v5","updated":"2023-08-28T08:02:47Z","published":"2023-02-09T01:09:57Z","title":"The Re-Label Method For Data-Centric Machine Learning","summary":" In industry deep learning application, our manually labeled data has a\ncertain number of noisy data. To solve this problem and achieve more than 90\nscore in dev dataset, we present a simple method to find the noisy data and\nre-label the noisy data by human, given the model predictions as references in\nhuman labeling. In this paper, we illustrate our idea for a broad set of deep\nlearning tasks, includes classification, sequence tagging, object detection,\nsequence generation, click-through rate prediction. The experimental results\nand human evaluation results verify our idea.\n","authors":["Tong Guo"],"pdf_url":"https://arxiv.org/pdf/2302.04391v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09729v3","updated":"2023-08-28T07:37:36Z","published":"2023-08-17T16:59:50Z","title":"MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large\n Language Models","summary":" LLMs usually exhibit limitations in their ability to incorporate new\nknowledge, the generation of hallucinations, and the transparency of their\ndecision-making process. 
In this paper, we explore how to prompt LLMs with\nknowledge graphs (KG), working as a remedy to engage LLMs with up-to-date\nknowledge and elicit the reasoning pathways from LLMs. Specifically, we build a\nprompting pipeline that endows LLMs with the capability of comprehending KG\ninputs and inferring with a combination of their implicit knowledge and the\nretrieved external knowledge. In addition, we investigate eliciting the mind\nmap on which LLMs perform the reasoning and generate the answers. We find that\nthe produced mind map exhibits the reasoning pathways of LLMs grounded in the\nontology of knowledge, hence bringing the prospects of probing and gauging LLM\ninference in production. The experiments on three question answering datasets\nalso show that MindMap prompting leads to a striking empirical gain. For\ninstance, prompting GPT-3.5 with MindMap consistently yields overwhelming\nperformance gains over GPT-4. We also demonstrate that with structured facts\nretrieved from KG, MindMap can outperform a series of\nprompting-with-document-retrieval methods, benefiting from more accurate,\nconcise, and comprehensive knowledge from KGs.\n","authors":["Yilin Wen","Zifeng Wang","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09729v3.pdf","comment":"7 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2305.07011v4","updated":"2023-08-28T07:29:03Z","published":"2023-05-11T17:53:29Z","title":"Region-Aware Pretraining for Open-Vocabulary Object Detection with\n Vision Transformers","summary":" We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a\ncontrastive image-text pretraining recipe to bridge the gap between image-level\npretraining and open-vocabulary object detection. At the pretraining phase, we\npropose to randomly crop and resize regions of positional embeddings instead of\nusing the whole image positional embeddings. This better matches the use of\npositional embeddings at the region level in the detection finetuning phase. In\naddition, we replace the common softmax cross entropy loss in contrastive\nlearning with focal loss to better learn the informative yet difficult\nexamples. Finally, we leverage recent advances in novel object proposals to\nimprove open-vocabulary detection finetuning. We evaluate our full model on the\nLVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer.\nRO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best\nexisting approach by +7.8 points in addition to competitive zero-shot transfer\ndetection. Surprisingly, RO-ViT improves the image-level representation as well\nand achieves the state of the art on 9 out of 12 metrics on COCO and Flickr\nimage-text retrieval benchmarks, outperforming competitive approaches with\nlarger models.\n","authors":["Dahun Kim","Anelia Angelova","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2305.07011v4.pdf","comment":"CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B\n result"},{"id":"http://arxiv.org/abs/2209.02552v2","updated":"2023-08-28T07:15:51Z","published":"2022-09-06T15:01:06Z","title":"Explaining Machine Learning Models in Natural Conversations: Towards a\n Conversational XAI Agent","summary":" The goal of Explainable AI (XAI) is to design methods to provide insights\ninto the reasoning process of black-box models, such as deep neural networks,\nin order to explain them to humans. 
Social science research states that such\nexplanations should be conversational, similar to human-to-human explanations.\nIn this work, we show how to incorporate XAI in a conversational agent, using a\nstandard design for the agent comprising natural language understanding and\ngeneration components. We build upon an XAI question bank, which we extend with\nquality-controlled paraphrases, to understand the user's information needs. We\nfurther systematically survey the literature for suitable explanation methods\nthat provide the information to answer those questions, and present a\ncomprehensive list of suggestions. Our work is the first step towards truly\nnatural conversations about machine learning models with an explanation agent.\nThe comprehensive list of XAI questions and the corresponding explanation\nmethods may support other researchers in providing the necessary information to\naddress users' demands.\n","authors":["Van Bach Nguyen","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2209.02552v2.pdf","comment":"Accepted at The World Conference on eXplainable Artificial\n Intelligence 2023 (XAI-2023)"},{"id":"http://arxiv.org/abs/2308.14359v1","updated":"2023-08-28T07:11:27Z","published":"2023-08-28T07:11:27Z","title":"Effect of Attention and Self-Supervised Speech Embeddings on\n Non-Semantic Speech Tasks","summary":" Human emotion understanding is pivotal in making conversational technology\nmainstream. We view speech emotion understanding as a perception task, which is\na more realistic setting. With varying contexts (languages, demographics,\netc.), different shares of people perceive the same speech segment as different\nemotions, so the perceived emotion is non-unanimous. As part of the ACM\nMultimedia 2023 Computational Paralinguistics ChallengE (ComParE) in the\nEMotion Share track, we leverage their rich dataset of multilingual speakers\nand the multi-label regression target of 'emotion share', or the perception of\nthat emotion. We demonstrate that the training scheme of different foundation\nmodels dictates their effectiveness for tasks beyond speech recognition,\nespecially for non-semantic speech tasks like emotion understanding. This is a\nvery complex task due to multilingual speakers, variability in the target\nlabels, and inherent imbalance in the regression dataset. Our results show that\nHuBERT-Large with a self-attention-based light-weight sequence model provides a\n4.6% improvement over the reported baseline.\n","authors":["Payal Mohapatra","Akash Pandey","Yueyuan Sui","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14359v1.pdf","comment":"Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges\n Track"},{"id":"http://arxiv.org/abs/2308.14353v1","updated":"2023-08-28T06:56:44Z","published":"2023-08-28T06:56:44Z","title":"ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large\n Language Models","summary":" The unprecedented performance of large language models (LLMs) requires\ncomprehensive and accurate evaluation. We argue that for LLM evaluation,\nbenchmarks need to be comprehensive and systematic. To this end, we propose the\nZhuJiu benchmark, which has the following strengths: (1) Multi-dimensional\nability coverage: We comprehensively evaluate LLMs across 7 ability dimensions\ncovering 51 tasks. In particular, we also propose a new benchmark that focuses\non the knowledge ability of LLMs. 
(2) Multi-faceted evaluation methods collaboration:\nWe use 3 different yet complementary evaluation methods to comprehensively\nevaluate LLMs, which can ensure the authority and accuracy of the evaluation\nresults. (3) Comprehensive Chinese benchmark: ZhuJiu is the pioneering\nbenchmark that fully assesses LLMs in Chinese, while also providing equally\nrobust evaluation abilities in English. (4) Avoiding potential data leakage: To\navoid data leakage, we construct evaluation data specifically for 37 tasks. We\nevaluate 10 current mainstream LLMs and conduct an in-depth discussion and\nanalysis of their results. The ZhuJiu benchmark and open-participation\nleaderboard are publicly released at http://www.zhujiu-benchmark.com/ and we\nalso provide a demo video at https://youtu.be/qypkJ89L1Ic.\n","authors":["Baoli Zhang","Haining Xie","Pengfan Du","Junhao Chen","Pengfei Cao","Yubo Chen","Shengping Liu","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14352v1","updated":"2023-08-28T06:56:08Z","published":"2023-08-28T06:56:08Z","title":"EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models","summary":" Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a\nrevolution in machine intelligence, owing to their exceptional capabilities in\na wide range of machine learning tasks. However, the transition of LLMs from\ndata centers to edge devices presents a set of challenges and opportunities.\nWhile this shift can enhance privacy and availability, it is hampered by the\nenormous parameter sizes of these models, leading to impractical runtime costs.\nIn light of these considerations, we introduce EdgeMoE, the first on-device\ninference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant\nof sparse LLMs that exhibit nearly constant computational complexity as their\nparameter size scales. EdgeMoE achieves both memory and computational\nefficiency by strategically partitioning the model across the storage\nhierarchy. Specifically, non-expert weights are stored in the device's memory,\nwhile expert weights are kept in external storage and are fetched into memory\nonly when they are activated. This design is underpinned by a crucial insight\nthat expert weights, though voluminous, are infrequently accessed due to sparse\nactivation patterns. To further mitigate the overhead associated with expert\nI/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise\nbitwidth adaptation: This method reduces the size of expert weights with an\nacceptable level of accuracy loss. (2) Expert management: It predicts the\nexperts that will be activated in advance and preloads them into the\ncompute-I/O pipeline, thus further optimizing the process. 
In empirical\nevaluations conducted on well-established MoE LLMs and various edge devices,\nEdgeMoE demonstrates substantial memory savings and performance improvements\nwhen compared to competitive baseline solutions.\n","authors":["Rongjie Yi","Liwei Guo","Shiyun Wei","Ao Zhou","Shangguang Wang","Mengwei Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14346v1","updated":"2023-08-28T06:41:49Z","published":"2023-08-28T06:41:49Z","title":"DISC-MedLLM: Bridging General Large Language Models and Real-World\n Medical Consultation","summary":" We propose DISC-MedLLM, a comprehensive solution that leverages Large\nLanguage Models (LLMs) to provide accurate and truthful medical responses in\nend-to-end conversational healthcare services. To construct high-quality\nSupervised Fine-Tuning (SFT) datasets, we employ three strategies: utilizing\nmedical knowledge-graphs, reconstructing real-world dialogues, and\nincorporating human-guided preference rephrasing. These datasets are\ninstrumental in training DISC-MedLLM, surpassing existing medical LLMs in both\nsingle-turn and multi-turn consultation scenarios. Extensive experimental\nresults demonstrate the effectiveness of the proposed model in bridging the gap\nbetween general language models and real-world medical consultation.\nAdditionally, we release the constructed dataset and model weights to further\ncontribute to research and development. Further details and resources can be\nfound at https://github.com/FudanDISC/DISC-MedLLM\n","authors":["Zhijie Bao","Wei Chen","Shengze Xiao","Kuang Ren","Jiaao Wu","Cheng Zhong","Jiajie Peng","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2308.14346v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.14337v1","updated":"2023-08-28T06:30:33Z","published":"2023-08-28T06:30:33Z","title":"Cognitive Effects in Large Language Models","summary":" Large Language Models (LLMs) such as ChatGPT have received enormous attention\nover the past year and are now used by hundreds of millions of people every\nday. The rapid adoption of this technology naturally raises questions about the\npossible biases such models might exhibit. In this work, we tested one of these\nmodels (GPT-3) on a range of cognitive effects, which are systematic patterns\nthat are usually found in human cognitive tasks. We found that LLMs are indeed\nprone to several human cognitive effects. Specifically, we show that the\npriming, distance, SNARC, and size congruity effects were present in GPT-3,\nwhile the anchoring effect was absent. We describe our methodology, and\nspecifically the way we converted real-world experiments to text-based\nexperiments. Finally, we speculate on the possible reasons why GPT-3 exhibits\nthese effects and discuss whether they are imitated or reinvented.\n","authors":["Jonathan Shaki","Sarit Kraus","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2308.14337v1.pdf","comment":"Accepted and will be published in the ECAI conference"},{"id":"http://arxiv.org/abs/2308.14321v1","updated":"2023-08-28T06:05:18Z","published":"2023-08-28T06:05:18Z","title":"Leveraging A Medical Knowledge Graph into Large Language Models for\n Diagnosis Prediction","summary":" Electronic Health Records (EHRs) and routine documentation practices play a\nvital role in patients' daily care, providing a holistic record of health,\ndiagnoses, and treatment. 
However, complex and verbose EHR narratives overload\nhealthcare providers, risking diagnostic inaccuracies. While Large Language\nModels (LLMs) have showcased their potential in diverse language tasks, their\napplication in the healthcare arena needs to ensure the minimization of\ndiagnostic errors and the prevention of patient harm. In this paper, we outline\nan innovative approach for augmenting the proficiency of LLMs in the realm of\nautomated diagnosis generation, achieved through the incorporation of a medical\nknowledge graph (KG) and a novel graph model: Dr.Knows, inspired by the\nclinical diagnostic reasoning process. We derive the KG from the National\nLibrary of Medicine's Unified Medical Language System (UMLS), a robust\nrepository of biomedical knowledge. Our method obviates the need for\npre-training and instead leverages the KG as an auxiliary instrument aiding in\nthe interpretation and summarization of complex medical concepts. Using\nreal-world hospital datasets, our experimental results demonstrate that the\nproposed approach of combining LLMs with KG has the potential to improve the\naccuracy of automated diagnosis generation. More importantly, our approach\noffers an explainable diagnostic pathway, edging us closer to the realization\nof AI-augmented diagnostic decision support systems.\n","authors":["Yanjun Gao","Ruizhe Li","John Caskey","Dmitriy Dligach","Timothy Miller","Matthew M. Churpek","Majid Afshar"],"pdf_url":"https://arxiv.org/pdf/2308.14321v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.03109v7","updated":"2023-08-28T05:50:53Z","published":"2023-07-06T16:28:35Z","title":"A Survey on Evaluation of Large Language Models","summary":" Large language models (LLMs) are gaining increasing popularity in both\nacademia and industry, owing to their unprecedented performance in various\napplications. As LLMs continue to play a vital role in both research and daily\nuse, their evaluation becomes increasingly critical, not only at the task\nlevel, but also at the society level for better understanding of their\npotential risks. Over the past years, significant efforts have been made to\nexamine LLMs from various perspectives. This paper presents a comprehensive\nreview of these evaluation methods for LLMs, focusing on three key dimensions:\nwhat to evaluate, where to evaluate, and how to evaluate. Firstly, we provide\nan overview from the perspective of evaluation tasks, encompassing general\nnatural language processing tasks, reasoning, medical usage, ethics,\neducation, natural and social sciences, agent applications, and other areas.\nSecondly, we answer the `where' and `how' questions by diving into the\nevaluation methods and benchmarks, which serve as crucial components in\nassessing the performance of LLMs. Then, we summarize the success and failure\ncases of LLMs in different tasks. Finally, we shed light on several future\nchallenges that lie ahead in LLM evaluation. Our aim is to offer invaluable\ninsights to researchers in the realm of LLM evaluation, thereby aiding the\ndevelopment of more proficient LLMs. Our key point is that evaluation should be\ntreated as an essential discipline to better assist the development of LLMs. We\nconsistently maintain the related open-source materials at:\nhttps://github.com/MLGroupJLU/LLM-eval-survey.\n","authors":["Yupeng Chang","Xu Wang","Jindong Wang","Yuan Wu","Linyi Yang","Kaijie Zhu","Hao Chen","Xiaoyuan Yi","Cunxiang Wang","Yidong Wang","Wei Ye","Yue Zhang","Yi Chang","Philip S. 
Yu","Qiang Yang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2307.03109v7.pdf","comment":"26 pages; a major update to include more recent works;\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2308.14306v1","updated":"2023-08-28T04:57:07Z","published":"2023-08-28T04:57:07Z","title":"Evaluating the Robustness to Instructions of Large Language Models","summary":" Recently, Instruction fine-tuning has risen to prominence as a potential\nmethod for enhancing the zero-shot capabilities of Large Language Models (LLMs)\non novel tasks. This technique has shown an exceptional ability to boost the\nperformance of moderately sized LLMs, sometimes even reaching performance\nlevels comparable to those of much larger model variants. The focus is on the\nrobustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an\nexploration of six models including Alpaca, Vicuna, WizardLM, and Traditional\nTask-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction\ndatasets as case studies. We carried out a comprehensive evaluation of these\ninstruction-following LLMs which have been tuned based on open-domain\ninstructions and task-oriented instructions. The main discussion is their\nperformance and robustness towards instructions. We have observed that in most\ncases, the model's performance in dealing with unfamiliar instructions tends to\nworsen significantly, and the robustness of the model for RE instructions\ndeteriorates compared to QA. Further, we discovered that up until a certain\nparameter size threshold (3B), the performance of the FLAN-T5 model improves as\nthe parameter count increases. The robustness of different scales of FLAN-T5\nmodels to RE instruction is worse than the robustness to QA instruction.\n","authors":["Yuansheng Ni","Sichao Jiang","Xinyu wu","Hui Shen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14306v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2308.06966v2","updated":"2023-08-28T04:12:30Z","published":"2023-08-14T06:49:53Z","title":"EcomGPT: Instruction-tuning Large Language Models with Chain-of-Task\n Tasks for E-commerce","summary":" Recently, instruction-following Large Language Models (LLMs) , represented by\nChatGPT, have exhibited exceptional performance in general Natural Language\nProcessing (NLP) tasks. However, the unique characteristics of E-commerce data\npose significant challenges to general LLMs. An LLM tailored specifically for\nE-commerce scenarios, possessing robust cross-dataset/task generalization\ncapabilities, is a pressing necessity. To solve this issue, in this work, we\nproposed the first e-commerce instruction dataset EcomInstruct, with a total of\n2.5 million instruction data. EcomInstruct scales up the data size and task\ndiversity by constructing atomic tasks with E-commerce basic data types, such\nas product information, user reviews. Atomic tasks are defined as intermediate\ntasks implicitly involved in solving a final task, which we also call\nChain-of-Task tasks. We developed EcomGPT with different parameter scales by\ntraining the backbone model BLOOMZ with the EcomInstruct. 
Benefiting from the\nfundamental semantic understanding capabilities acquired from the Chain-of-Task\ntasks, EcomGPT exhibits excellent zero-shot generalization capabilities.\nExtensive experiments and human evaluations demonstrate that EcomGPT\noutperforms ChatGPT in terms of cross-dataset/task generalization on E-commerce\ntasks.\n","authors":["Yangning Li","Shirong Ma","Xiaobin Wang","Shen Huang","Chengyue Jiang","Hai-Tao Zheng","Pengjun Xie","Fei Huang","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.06966v2.pdf","comment":"Initial version of EcomGPT"},{"id":"http://arxiv.org/abs/2308.14280v1","updated":"2023-08-28T03:26:21Z","published":"2023-08-28T03:26:21Z","title":"FonMTL: Towards Multitask Learning for the Fon Language","summary":" The Fon language, spoken by around 2 million people, is a truly\nlow-resourced African language, with a limited online presence and few existing\ndatasets (to name just a few challenges). Multitask learning is a learning\nparadigm that aims to improve the generalization capacity of a model by sharing\nknowledge across different but related tasks: this can be particularly\nbeneficial in very data-scarce scenarios. In this paper, we present the first\nexplorative approach to multitask learning for enhancing model capabilities in\nNatural Language Processing for the Fon language. Specifically, we explore the\ntasks of Named Entity Recognition (NER) and Part of Speech Tagging (POS) for\nFon. We leverage two language model heads as encoders to build shared\nrepresentations for the inputs, and we use linear layer blocks for\nclassification relative to each task. Our results on the NER and POS tasks for\nFon show competitive (or better) performances compared to several multilingual\npretrained language models finetuned on single tasks. Additionally, we perform\na few ablation studies to compare the efficiency of two different loss\ncombination strategies and find that the equal loss weighting approach works\nbest in our case. Our code is open-sourced at\nhttps://github.com/bonaventuredossou/multitask_fon.\n","authors":["Bonaventure F. P. Dossou","Iffanice Houndayi","Pamely Zantou","Gilles Hacheme"],"pdf_url":"https://arxiv.org/pdf/2308.14280v1.pdf","comment":"Accepted at WiNLP workshop, co-located at EMNLP 2023"},{"id":"http://arxiv.org/abs/2308.14272v1","updated":"2023-08-28T03:03:03Z","published":"2023-08-28T03:03:03Z","title":"Goodhart's Law Applies to NLP's Explanation Benchmarks","summary":" Despite the rising popularity of saliency-based explanations, the research\ncommunity remains at an impasse, facing doubts concerning their purpose,\nefficacy, and tendency to contradict each other. Seeking to unite the\ncommunity's efforts around common goals, several recent works have proposed\nevaluation metrics. In this paper, we critically examine two sets of metrics:\nthe ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics,\nfocusing our inquiry on natural language processing. First, we show that we can\ninflate a model's comprehensiveness and sufficiency scores dramatically without\naltering its predictions or explanations on in-distribution test inputs. Our\nstrategy exploits the tendency for extracted explanations and their complements\nto be \"out-of-support\" relative to each other and in-distribution inputs. Next,\nwe demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple\nmethod that encodes the label, even though EVAL-X is precisely motivated to\naddress such exploits. 
Our results raise doubts about the ability of current\nmetrics to guide explainability research, underscoring the need for a broader\nreassessment of what precisely these metrics are intended to capture.\n","authors":["Jennifer Hsia","Danish Pruthi","Aarti Singh","Zachary C. Lipton"],"pdf_url":"https://arxiv.org/pdf/2308.14272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14266v1","updated":"2023-08-28T02:48:49Z","published":"2023-08-28T02:48:49Z","title":"SalesBot 2.0: A Human-Like Intent-Guided Chit-Chat Dataset","summary":" In recent research on dialogue systems and corpora, there has been a\nsignificant focus on two distinct categories: task-oriented (TOD) and\nopen-domain (chit-chat) dialogues. TOD systems aim to satisfy specific user\ngoals, such as finding a movie to watch, whereas open-domain systems primarily\nfocus on generating engaging conversations. A recent study by Chiu et al.\n(2022) introduced SalesBot, which provides simulators and a dataset with\none-turn transition from chit-chat to task-oriented dialogues. However, the\npreviously generated data solely relied on BlenderBot, which raised concerns\nabout its long-turn naturalness and consistency during a conversation. To\naddress this issue, this paper aims to build SalesBot 2.0, a revised version of\nthe published data, by leveraging the commonsense knowledge of large language\nmodels (LLMs) through proper prompting. The objective is to gradually bridge\nthe gap between chit-chat and TOD towards better naturalness and consistency.\nThe newly released large-scale dataset with detailed annotations exhibits\nsmoother transitions between topics and is more human-like in terms of\nnaturalness and consistency. It can serve as a valuable resource for both\nacademic research and commercial applications. Furthermore, our proposed\nframework can be applied to generate numerous dialogues with various target\nintents.\n","authors":["Wen-Yu Chang","Yun-Nung Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14242v1","updated":"2023-08-28T01:05:18Z","published":"2023-08-28T01:05:18Z","title":"The Cultural Psychology of Large Language Models: Is ChatGPT a Holistic\n or Analytic Thinker?","summary":" The prevalent use of Large Language Models (LLMs) has necessitated studying\ntheir mental models, yielding noteworthy theoretical and practical\nimplications. Current research has demonstrated that state-of-the-art LLMs,\nsuch as ChatGPT, exhibit certain theory of mind capabilities and possess\nrelatively stable Big Five and/or MBTI personality traits. In addition,\ncognitive process features form an essential component of these mental models.\nResearch in cultural psychology indicated significant differences in the\ncognitive processes of Eastern and Western people when processing information\nand making judgments. While Westerners predominantly exhibit analytical\nthinking that isolates things from their environment to analyze their nature\nindependently, Easterners often showcase holistic thinking, emphasizing\nrelationships and adopting a global viewpoint. In our research, we probed the\ncultural cognitive traits of ChatGPT. We employed two scales that directly\nmeasure the cognitive process: the Analysis-Holism Scale (AHS) and the Triadic\nCategorization Task (TCT). Additionally, we used two scales that investigate\nthe value differences shaped by cultural thinking: the Dialectical Self Scale\n(DSS) and the Self-construal Scale (SCS). 
In cognitive process tests (AHS/TCT),\nChatGPT consistently tends towards Eastern holistic thinking, but regarding\nvalue judgments (DSS/SCS), ChatGPT does not significantly lean towards the East\nor the West. We suggest that the result could be attributed to both the\ntraining paradigm and the training data in LLM development. We discuss the\npotential value of this finding for AI research and directions for future\nresearch.\n","authors":["Chuanyang Jin","Songyang Zhang","Tianmin Shu","Zhihan Cui"],"pdf_url":"https://arxiv.org/pdf/2308.14242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01002v2","updated":"2023-08-28T00:57:19Z","published":"2023-04-03T14:06:47Z","title":"Does Human Collaboration Enhance the Accuracy of Identifying\n LLM-Generated Deepfake Texts?","summary":" Advances in Large Language Models (e.g., GPT-4, LLaMA) have improved the\ngeneration of coherent sentences resembling human writing on a large scale,\nresulting in the creation of so-called deepfake texts. However, this progress\nposes security and privacy concerns, necessitating effective solutions for\ndistinguishing deepfake texts from human-written ones. Although prior works\nstudied humans' ability to detect deepfake texts, none has examined whether\n\"collaboration\" among humans improves the detection of deepfake texts. In this\nstudy, to address this gap of understanding on deepfake texts, we conducted\nexperiments with two groups: (1) nonexpert individuals from the AMT platform\nand (2) writing experts from the Upwork platform. The results demonstrate that\ncollaboration among humans can potentially improve the detection of deepfake\ntexts for both groups, increasing detection accuracies by 6.36% for non-experts\nand 12.76% for experts, respectively, compared to individuals' detection\naccuracies. We further analyze the explanations that humans used for detecting\na piece of text as deepfake text, and find that the strongest indicator of\ndeepfake texts is their lack of coherence and consistency. Our study provides\nuseful insights for future tools and framework designs to facilitate the\ncollaborative human detection of deepfake texts. The experiment datasets and\nAMT implementations are available at:\nhttps://github.com/huashen218/llm-deepfake-human-study.git\n","authors":["Adaku Uchendu","Jooyoung Lee","Hua Shen","Thai Le","Ting-Hao 'Kenneth' Huang","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2304.01002v2.pdf","comment":"Accepted at The 11th AAAI Conference on Human Computation and\n Crowdsourcing (HCOMP 2023)"},{"id":"http://arxiv.org/abs/2305.17118v2","updated":"2023-08-28T22:48:46Z","published":"2023-05-26T17:39:58Z","title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for\n LLM KV Cache Compression at Test Time","summary":" Large language models(LLMs) have sparked a new wave of exciting AI\napplications. Hosting these models at scale requires significant memory\nresources. One crucial memory bottleneck for the deployment stems from the\ncontext window. It is commonly recognized that model weights are memory hungry;\nhowever, the size of key-value embedding stored during the generation process\n(KV cache) can easily surpass the model size. The enormous size of the KV cache\nputs constraints on the inference batch size, which is crucial for high\nthroughput inference workload. 
Inspired by an interesting observation of the\nattention scores, we hypothesize the persistence of importance: only pivotal\ntokens, which had a substantial influence at one step, will significantly\ninfluence future generations. Based on our empirical verification and\ntheoretical analysis around this hypothesis, we propose Scissorhands, a system\nthat maintains the memory usage of the KV cache at a fixed budget without\nfinetuning the model. In essence, Scissorhands manages the KV cache by storing\nthe pivotal tokens with a higher probability. We validate that Scissorhands\nreduces the inference memory usage of the KV cache by up to 5X without\ncompromising model quality. We further demonstrate that Scissorhands can be\ncombined with 4-bit quantization, traditionally used to compress model weights,\nto achieve up to 20X compression.\n","authors":["Zichang Liu","Aditya Desai","Fangshuo Liao","Weitao Wang","Victor Xie","Zhaozhuo Xu","Anastasios Kyrillidis","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2305.17118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14921v1","updated":"2023-08-28T22:32:05Z","published":"2023-08-28T22:32:05Z","title":"Gender bias and stereotypes in Large Language Models","summary":" Large Language Models (LLMs) have made substantial progress in the past\nseveral months, shattering state-of-the-art benchmarks in many domains. This\npaper investigates LLMs' behavior with respect to gender stereotypes, a known\nissue for prior models. We use a simple paradigm to test the presence of gender\nbias, building on but differing from WinoBias, a commonly used gender bias\ndataset, which is likely to be included in the training data of current LLMs.\nWe test four recently published LLMs and demonstrate that they express biased\nassumptions about men and women's occupations. Our contributions in this paper\nare as follows: (a) LLMs are 3-6 times more likely to choose an occupation that\nstereotypically aligns with a person's gender; (b) these choices align with\npeople's perceptions better than with the ground truth as reflected in official\njob statistics; (c) LLMs in fact amplify the bias beyond what is reflected in\nperceptions or the ground truth; (d) LLMs ignore crucial ambiguities in\nsentence structure 95% of the time in our study items, but when explicitly\nprompted, they recognize the ambiguity; (e) LLMs provide explanations for their\nchoices that are factually inaccurate and likely obscure the true reason behind\ntheir predictions. That is, they provide rationalizations of their biased\nbehavior. This highlights a key property of these models: LLMs are trained on\nimbalanced datasets; as such, even with the recent successes of reinforcement\nlearning with human feedback, they tend to reflect those imbalances back at us.\nAs with other types of societal biases, we suggest that LLMs must be carefully\ntested to ensure that they treat minoritized individuals and communities\nequitably.\n","authors":["Hadas Kotek","Rikker Dockum","David Q. 
Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14921v1.pdf","comment":"ACM Collective Intelligence"},{"id":"http://arxiv.org/abs/2308.14905v1","updated":"2023-08-28T21:16:08Z","published":"2023-08-28T21:16:08Z","title":"Neural approaches to spoken content embedding","summary":" Comparing spoken segments is a central operation to speech processing.\nTraditional approaches in this area have favored frame-level dynamic\nprogramming algorithms, such as dynamic time warping, because they require no\nsupervision, but they are limited in performance and efficiency. As an\nalternative, acoustic word embeddings -- fixed-dimensional vector\nrepresentations of variable-length spoken word segments -- have begun to be\nconsidered for such tasks as well. However, the current space of such\ndiscriminative embedding models, training approaches, and their application to\nreal-world downstream tasks is limited. We start by considering ``single-view\"\ntraining losses where the goal is to learn an acoustic word embedding model\nthat separates same-word and different-word spoken segment pairs. Then, we\nconsider ``multi-view\" contrastive losses. In this setting, acoustic word\nembeddings are learned jointly with embeddings of character sequences to\ngenerate acoustically grounded embeddings of written words, or acoustically\ngrounded word embeddings.\n In this thesis, we contribute new discriminative acoustic word embedding\n(AWE) and acoustically grounded word embedding (AGWE) approaches based on\nrecurrent neural networks (RNNs). We improve model training in terms of both\nefficiency and performance. We take these developments beyond English to\nseveral low-resource languages and show that multilingual training improves\nperformance when labeled data is limited. We apply our embedding models, both\nmonolingual and multilingual, to the downstream tasks of query-by-example\nspeech search and automatic speech recognition. Finally, we show how our\nembedding approaches compare with and complement more recent self-supervised\nspeech models.\n","authors":["Shane Settle"],"pdf_url":"https://arxiv.org/pdf/2308.14905v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2306.06826v2","updated":"2023-08-28T21:14:35Z","published":"2023-06-12T02:26:00Z","title":"When Do Annotator Demographics Matter? Measuring the Influence of\n Annotator Demographics with the POPQUORN Dataset","summary":" Annotators are not fungible. Their demographics, life experiences, and\nbackgrounds all contribute to how they label data. However, NLP has only\nrecently considered how annotator identity might influence their decisions.\nHere, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering,\nOffensiveness, text Rewriting, and politeness rating with demographic Nuance).\nPOPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a\nrepresentative sample regarding sex, age, and race as the US population.\nThrough a series of analyses, we show that annotators' background plays a\nsignificant role in their judgments. Further, our work shows that backgrounds\nnot previously considered in NLP (e.g., education), are meaningful and should\nbe considered. Our study suggests that understanding the background of\nannotators and collecting labels from a demographically balanced pool of crowd\nworkers is important to reduce the bias of datasets. 
The dataset, annotator\nbackground, and annotation interface are available at\nhttps://github.com/Jiaxin-Pei/potato-prolific-dataset .\n","authors":["Jiaxin Pei","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2306.06826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14903v1","updated":"2023-08-28T21:11:18Z","published":"2023-08-28T21:11:18Z","title":"MEMORY-VQ: Compression for Tractable Internet-Scale Memory","summary":" Retrieval augmentation is a powerful but expensive method to make language\nmodels more knowledgeable about the world. Memory-based methods like LUMEN\npre-compute token representations for retrieved passages to drastically speed\nup inference. However, memory also leads to much greater storage requirements\nfrom storing pre-computed representations.\n We propose MEMORY-VQ, a new method to reduce storage requirements of\nmemory-augmented models without sacrificing performance. Our method uses a\nvector quantization variational autoencoder (VQ-VAE) to compress token\nrepresentations. We apply MEMORY-VQ to the LUMEN model to obtain LUMEN-VQ, a\nmemory model that achieves a 16x compression rate with comparable performance\non the KILT benchmark. LUMEN-VQ enables practical retrieval augmentation even\nfor extremely large retrieval corpora.\n","authors":["Yury Zemlyanskiy","Michiel de Jong","Luke Vilnis","Santiago Ontañón","William W. Cohen","Sumit Sanghai","Joshua Ainslie"],"pdf_url":"https://arxiv.org/pdf/2308.14903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14894v1","updated":"2023-08-28T20:31:45Z","published":"2023-08-28T20:31:45Z","title":"Multiscale Contextual Learning for Speech Emotion Recognition in\n Emergency Call Center Conversations","summary":" Emotion recognition in conversations is essential for ensuring advanced\nhuman-machine interactions. However, creating robust and accurate emotion\nrecognition systems in real life is challenging, mainly due to the scarcity of\nemotion datasets collected in the wild and the inability to take into account\nthe dialogue context. The CEMO dataset, composed of conversations between\nagents and patients during emergency calls to a French call center, fills this\ngap. The nature of these interactions highlights the role of the emotional flow\nof the conversation in predicting patient emotions, as context can often make a\ndifference in understanding actual feelings. This paper presents a multi-scale\nconversational context learning approach for speech emotion recognition, which\ntakes advantage of this hypothesis. We investigated this approach on both\nspeech transcriptions and acoustic segments. Experimentally, our method uses\nthe previous or next information of the targeted segment. In the text domain,\nwe tested the context window using a wide range of tokens (from 10 to 100) and\nat the speech turns level, considering inputs from both the same and opposing\nspeakers. According to our tests, the context derived from previous tokens has\na more significant influence on accurate prediction than the following tokens.\nFurthermore, taking the last speech turn of the same speaker in the\nconversation seems useful. In the acoustic domain, we conducted an in-depth\nanalysis of the impact of the surrounding emotions on the prediction. 
While\nmulti-scale conversational context learning using Transformers can enhance\nperformance in the textual modality for emergency call recordings,\nincorporating acoustic context is more challenging.\n","authors":["Théo Deschamps-Berger","Lori Lamel","Laurence Devillers"],"pdf_url":"https://arxiv.org/pdf/2308.14894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19370v3","updated":"2023-08-28T20:13:33Z","published":"2023-05-30T19:25:51Z","title":"Blockwise Parallel Transformer for Large Context Models","summary":" Transformers have emerged as the cornerstone of state-of-the-art natural\nlanguage processing models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands posed by the\nself-attention mechanism and the large feedforward network in Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving multiple long sequences or long-term dependencies. We present a\ndistinct approach, Blockwise Parallel Transformer (BPT), that leverages\nblockwise computation of self-attention and feedforward network fusion to\nminimize memory costs. By processing longer input sequences while maintaining\nmemory efficiency, BPT enables training sequences 32 times longer than vanilla\nTransformers and up to 4 times longer than previous memory-efficient methods.\nExtensive experiments on language modeling and reinforcement learning tasks\ndemonstrate the effectiveness of BPT in reducing memory requirements and\nimproving performance.\n","authors":["Hao Liu","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2305.19370v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14873v1","updated":"2023-08-28T19:52:18Z","published":"2023-08-28T19:52:18Z","title":"CommunityFish: A Poisson-based Document Scaling With Hierarchical\n Clustering","summary":" Document scaling has been a key component in text-as-data applications for\nsocial scientists and a major field of interest for political researchers, who\naim at uncovering differences between speakers or parties with the help of\ndifferent probabilistic and non-probabilistic approaches. Yet, most of these\ntechniques are either built upon the agnostic bag-of-words hypothesis or use\nprior information borrowed from external sources that might embed the results\nwith a significant bias. While the corpus has long been considered a collection\nof documents, it can also be seen as a dense network of connected words whose\nstructure could be clustered to differentiate independent groups of words,\nbased on their co-occurrences in documents, known as communities. This paper\nintroduces CommunityFish as an augmented version of Wordfish based on a\nhierarchical clustering, namely the Louvain algorithm, on the word space to\nyield communities as semantic and independent n-grams emerging from the corpus\nand use them as an input to the Wordfish method, instead of considering the\nword space. This strategy emphasizes the interpretability of the results, since\ncommunities have a non-overlapping structure, hence a crucial informative power\nin discriminating parties or speakers, in addition to allowing a faster\nexecution of the Poisson scaling model. 
Aside from yielding communities,\nassumed to be subtopic proxies, the application of this technique outperforms\nthe classic Wordfish model by highlighting historical developments in the U.S.\nState of the Union addresses and was found to replicate the prevailing\npolitical stance in Germany when using the corpus of parties' legislative\nmanifestos.\n","authors":["Sami Diaf"],"pdf_url":"https://arxiv.org/pdf/2308.14873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14850v1","updated":"2023-08-28T19:11:52Z","published":"2023-08-28T19:11:52Z","title":"Attention Visualizer Package: Revealing Word Importance for Deeper\n Insight into Encoder-Only Transformer Models","summary":" This report introduces the Attention Visualizer package, which is crafted to\nvisually illustrate the significance of individual words in encoder-only\ntransformer-based models. In contrast to other methods that center on tokens\nand self-attention scores, our approach will examine the words and their impact\non the final embedding representation. Libraries like this play a crucial role\nin enhancing the interpretability and explainability of neural networks. They\noffer the opportunity to illuminate their internal mechanisms, providing a\nbetter understanding of how they operate and can be enhanced. You can access\nthe code and review examples on the following GitHub repository:\nhttps://github.com/AlaFalaki/AttentionVisualizer.\n","authors":["Ala Alam Falaki","Robin Gras"],"pdf_url":"https://arxiv.org/pdf/2308.14850v1.pdf","comment":"12 pages, 15 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.14753v1","updated":"2023-08-28T17:59:47Z","published":"2023-08-28T17:59:47Z","title":"Efficient Discovery and Effective Evaluation of Visual Perceptual\n Similarity: A Benchmark and Beyond","summary":" Visual similarities discovery (VSD) is an important task with broad\ne-commerce applications. Given an image of a certain object, the goal of VSD is\nto retrieve images of different objects with high perceptual visual similarity.\nAlthough being a highly addressed problem, the evaluation of proposed methods\nfor VSD is often based on a proxy of an identification-retrieval task,\nevaluating the ability of a model to retrieve different images of the same\nobject. We posit that evaluating VSD methods based on identification tasks is\nlimited, and faithful evaluation must rely on expert annotations. In this\npaper, we introduce the first large-scale fashion visual similarity benchmark\ndataset, consisting of more than 110K expert-annotated image pairs. Besides\nthis major contribution, we share insight from the challenges we faced while\ncurating this dataset. Based on these insights, we propose a novel and\nefficient labeling procedure that can be applied to any dataset. Our analysis\nexamines its limitations and inductive biases, and based on these findings, we\npropose metrics to mitigate those limitations. 
Though our primary focus lies on\nvisual similarity, the methodologies we present have broader applications for\ndiscovering and evaluating perceptual similarity across various domains.\n","authors":["Oren Barkan","Tal Reiss","Jonathan Weill","Ori Katz","Roy Hirsch","Itzik Malkiel","Noam Koenigstein"],"pdf_url":"https://arxiv.org/pdf/2308.14753v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14749v1","updated":"2023-08-28T17:56:22Z","published":"2023-08-28T17:56:22Z","title":"MagicEdit: High-Fidelity and Temporally Coherent Video Editing","summary":" In this report, we present MagicEdit, a surprisingly simple yet effective\nsolution to the text-guided video editing task. We found that high-fidelity and\ntemporally coherent video-to-video translation can be achieved by explicitly\ndisentangling the learning of content, structure and motion signals during\ntraining. This is in contrast to most existing methods, which attempt to\njointly model both the appearance and temporal representation within a single\nframework, which, we argue, would lead to degradation in per-frame quality.\nDespite its simplicity, we show that MagicEdit supports various downstream\nvideo editing tasks, including video stylization, local editing, video-MagicMix\nand video outpainting.\n","authors":["Jun Hao Liew","Hanshu Yan","Jianfeng Zhang","Zhongcong Xu","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2308.14749v1.pdf","comment":"Project page: https://magic-edit.github.io/"},{"id":"http://arxiv.org/abs/2308.14748v1","updated":"2023-08-28T17:56:18Z","published":"2023-08-28T17:56:18Z","title":"MagicAvatar: Multimodal Avatar Generation and Animation","summary":" This report presents MagicAvatar, a framework for multimodal video generation\nand animation of human avatars. Unlike most existing methods that generate\navatar-centric videos directly from multimodal inputs (e.g., text prompts),\nMagicAvatar explicitly disentangles avatar video generation into two stages:\n(1) multimodal-to-motion and (2) motion-to-video generation. The first stage\ntranslates the multimodal inputs into motion/control signals (e.g., human\npose, depth, DensePose); while the second stage generates avatar-centric video\nguided by these motion signals. Additionally, MagicAvatar supports avatar\nanimation by simply providing a few images of the target person. This\ncapability enables the animation of the provided human identity according to\nthe specific motion derived from the first stage. We demonstrate the\nflexibility of MagicAvatar through various applications, including text-guided\nand video-guided avatar generation, as well as multimodal avatar animation.\n","authors":["Jianfeng Zhang","Hanshu Yan","Zhongcong Xu","Jiashi Feng","Jun Hao Liew"],"pdf_url":"https://arxiv.org/pdf/2308.14748v1.pdf","comment":"Project page: https://magic-avatar.github.io/"},{"id":"http://arxiv.org/abs/2308.14746v1","updated":"2023-08-28T17:55:33Z","published":"2023-08-28T17:55:33Z","title":"CoVR: Learning Composed Video Retrieval from Web Video Captions","summary":" Composed Image Retrieval (CoIR) has recently gained popularity as a task that\nconsiders both text and image queries together, to search for relevant images\nin a database. Most CoIR approaches require manually annotated datasets,\ncomprising image-text-image triplets, where the text describes a modification\nfrom the query image to the target image. However, manual curation of CoIR\ntriplets is expensive and prevents scalability. 
In this work, we instead\npropose a scalable automatic dataset creation methodology that generates\ntriplets given video-caption pairs, while also expanding the scope of the task\nto include composed video retrieval (CoVR). To this end, we mine paired videos\nwith a similar caption from a large database, and leverage a large language\nmodel to generate the corresponding modification text. Applying this\nmethodology to the extensive WebVid2M collection, we automatically construct\nour WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we\nintroduce a new benchmark for CoVR with a manually annotated evaluation set,\nalong with baseline results. Our experiments further demonstrate that training\na CoVR model on our dataset effectively transfers to CoIR, leading to improved\nstate-of-the-art performance in the zero-shot setup on both the CIRR and\nFashionIQ benchmarks. Our code, datasets, and models are publicly available at\nhttps://imagine.enpc.fr/~ventural/covr.\n","authors":["Lucas Ventura","Antoine Yang","Cordelia Schmid","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2308.14746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04688v3","updated":"2023-08-28T17:51:52Z","published":"2023-04-10T16:08:59Z","title":"Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action\n Detection","summary":" The goal of spatial-temporal action detection is to determine the time and\nplace where each person's action occurs in a video and classify the\ncorresponding action category. Most of the existing methods adopt\nfully-supervised learning, which requires a large amount of training data,\nmaking it very difficult to achieve zero-shot learning. In this paper, we\npropose to utilize a pre-trained visual-language model to extract the\nrepresentative image and text features, and model the relationship between\nthese features through different interaction modules to obtain the interaction\nfeature. In addition, we use this feature to prompt each label to obtain more\nappropriate text features. Finally, we calculate the similarity between the\ninteraction feature and the text feature for each label to determine the action\ncategory. Our experiments on J-HMDB and UCF101-24 datasets demonstrate that the\nproposed interaction module and prompting make the visual-language features\nbetter aligned, thus achieving excellent accuracy for zero-shot spatio-temporal\naction detection. The code will be available at\nhttps://github.com/webber2933/iCLIP.\n","authors":["Wei-Jhe Huang","Jheng-Hsien Yeh","Min-Hung Chen","Gueter Josmy Faure","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2304.04688v3.pdf","comment":"Accepted by ICCVW 2023 (What is Next in Multimodal Foundation\n Models?)"},{"id":"http://arxiv.org/abs/2308.11487v2","updated":"2023-08-28T17:42:45Z","published":"2023-08-22T15:06:14Z","title":"Free Lunch for Gait Recognition: A Novel Relation Descriptor","summary":" Gait recognition is to seek correct matches for query individuals by their\nunique walking patterns. However, current methods focus solely on extracting\nindividual-specific features, overlooking inter-personal relationships. In this\npaper, we propose a novel $\\textbf{Relation Descriptor}$ that captures not only\nindividual features but also relations between test gaits and pre-selected\nanchored gaits. 
Specifically, we reinterpret classifier weights as anchored\ngaits and compute similarity scores between test features and these anchors,\nwhich re-expresses individual gait features into a similarity relation\ndistribution. In essence, the relation descriptor offers a holistic perspective\nthat leverages the collective knowledge stored within the classifier's weights,\nemphasizing meaningful patterns and enhancing robustness. Despite its\npotential, relation descriptor poses dimensionality challenges since its\ndimension depends on the training set's identity count. To address this, we\npropose the Farthest Anchored-gait Selection to identify the most\ndiscriminative anchored gaits and an Orthogonal Regularization to increase\ndiversity within anchored gaits. Compared to individual-specific features\nextracted from the backbone, our relation descriptor can boost the performances\nnearly without any extra costs. We evaluate the effectiveness of our method on\nthe popular GREW, Gait3D, CASIA-B, and OU-MVLP, showing that our method\nconsistently outperforms the baselines and achieves state-of-the-art\nperformances.\n","authors":["Jilong Wang","Saihui Hou","Yan Huang","Chunshui Cao","Xu Liu","Yongzhen Huang","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11487v2.pdf","comment":"Add new figures and fix some typos"},{"id":"http://arxiv.org/abs/2308.14740v1","updated":"2023-08-28T17:41:14Z","published":"2023-08-28T17:41:14Z","title":"Total Selfie: Generating Full-Body Selfies","summary":" We present a method to generate full-body selfies -- photos that you take of\nyourself, but capturing your whole body as if someone else took the photo of\nyou from a few feet away. Our approach takes as input a pre-captured video of\nyour body, a target pose photo, and a selfie + background pair for each\nlocation. We introduce a novel diffusion-based approach to combine all of this\ninformation into high quality, well-composed photos of you with the desired\npose and background.\n","authors":["Bowei Chen","Brian Curless","Ira Kemelmacher-Shlizerman","Steve Seitz"],"pdf_url":"https://arxiv.org/pdf/2308.14740v1.pdf","comment":"Project page:\n https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/"},{"id":"http://arxiv.org/abs/2308.14737v1","updated":"2023-08-28T17:38:31Z","published":"2023-08-28T17:38:31Z","title":"Flexible Techniques for Differentiable Rendering with 3D Gaussians","summary":" Fast, reliable shape reconstruction is an essential ingredient in many\ncomputer vision applications. Neural Radiance Fields demonstrated that\nphotorealistic novel view synthesis is within reach, but was gated by\nperformance requirements for fast reconstruction of real scenes and objects.\nSeveral recent approaches have built on alternative shape representations, in\nparticular, 3D Gaussians. We develop extensions to these renderers, such as\nintegrating differentiable optical flow, exporting watertight meshes and\nrendering per-ray normals. Additionally, we show how two of the recent methods\nare interoperable with each other. These reconstructions are quick, robust, and\neasily performed on GPU or CPU. 
For code and visual examples, see\nhttps://leonidk.github.io/fmb-plus\n","authors":["Leonid Keselman","Martial Hebert"],"pdf_url":"https://arxiv.org/pdf/2308.14737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14726v1","updated":"2023-08-28T17:30:14Z","published":"2023-08-28T17:30:14Z","title":"PanoSwin: a Pano-style Swin Transformer for Panorama Understanding","summary":" In panorama understanding, the widely used equirectangular projection (ERP)\nentails boundary discontinuity and spatial distortion. It severely deteriorates\nthe conventional CNNs and vision Transformers on panoramas. In this paper, we\npropose a simple yet effective architecture named PanoSwin to learn panorama\nrepresentations with ERP. To deal with the challenges brought by\nequirectangular projection, we explore a pano-style shift windowing scheme and\nnovel pitch attention to address the boundary discontinuity and the spatial\ndistortion, respectively. Besides, based on spherical distance and Cartesian\ncoordinates, we adapt absolute positional embeddings and relative positional\nbiases for panoramas to enhance panoramic geometry information. Realizing that\nplanar image understanding might share some common knowledge with panorama\nunderstanding, we devise a novel two-stage learning framework to facilitate\nknowledge transfer from the planar images to panoramas. We conduct experiments\nagainst the state-of-the-art on various panoramic tasks, i.e., panoramic object\ndetection, panoramic classification, and panoramic layout estimation. The\nexperimental results demonstrate the effectiveness of PanoSwin in panorama\nunderstanding.\n","authors":["Zhixin Ling","Zhen Xing","Xiangdong Zhou","Manliang Cao","Guichun Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14726v1.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2308.14713v1","updated":"2023-08-28T17:13:49Z","published":"2023-08-28T17:13:49Z","title":"R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras","summary":" Dense 3D reconstruction and ego-motion estimation are key challenges in\nautonomous driving and robotics. Compared to the complex, multi-modal systems\ndeployed today, multi-camera systems provide a simpler, low-cost alternative.\nHowever, camera-based 3D reconstruction of complex dynamic scenes has proven\nextremely difficult, as existing solutions often produce incomplete or\nincoherent results. We propose R3D3, a multi-camera system for dense 3D\nreconstruction and ego-motion estimation. Our approach iterates between\ngeometric estimation that exploits spatial-temporal information from multiple\ncameras, and monocular depth refinement. We integrate multi-camera feature\ncorrelation and dense bundle adjustment operators that yield robust geometric\ndepth and pose estimates. To improve reconstruction where geometric depth is\nunreliable, e.g. for moving objects or low-textured regions, we introduce\nlearnable scene priors via a depth refinement network. We show that this design\nenables a dense, consistent 3D reconstruction of challenging, dynamic outdoor\nenvironments. Consequently, we achieve state-of-the-art dense depth prediction\non the DDAD and NuScenes benchmarks.\n","authors":["Aron Schmied","Tobias Fischer","Martin Danelljan","Marc Pollefeys","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14713v1.pdf","comment":"Accepted to ICCV 2023. 
Project page is available at\n https://www.vis.xyz/pub/r3d3/"},{"id":"http://arxiv.org/abs/2308.14710v1","updated":"2023-08-28T17:10:12Z","published":"2023-08-28T17:10:12Z","title":"VideoCutLER: Surprisingly Simple Unsupervised Video Instance\n Segmentation","summary":" Existing approaches to unsupervised video instance segmentation typically\nrely on motion estimates and experience difficulties tracking small or\ndivergent motions. We present VideoCutLER, a simple method for unsupervised\nmulti-instance video segmentation without using motion-based learning signals\nlike optical flow or training on natural videos. Our key insight is that using\nhigh-quality pseudo masks and a simple video synthesis method for model\ntraining is surprisingly sufficient to enable the resulting video model to\neffectively segment and track multiple instances across video frames. We show\nthe first competitive unsupervised learning results on the challenging\nYouTubeVIS-2019 benchmark, achieving 50.7% APvideo^50 , surpassing the previous\nstate-of-the-art by a large margin. VideoCutLER can also serve as a strong\npretrained model for supervised video instance segmentation tasks, exceeding\nDINO by 15.9% on YouTubeVIS-2019 in terms of APvideo.\n","authors":["Xudong Wang","Ishan Misra","Ziyun Zeng","Rohit Girdhar","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2308.14710v1.pdf","comment":"Preprint. Code: https://github.com/facebookresearch/CutLER"},{"id":"http://arxiv.org/abs/2206.14996v2","updated":"2023-08-28T16:38:19Z","published":"2022-06-30T03:09:59Z","title":"Cross-domain Federated Object Detection","summary":" Detection models trained by one party (including server) may face severe\nperformance degradation when distributed to other users (clients). Federated\nlearning can enable multi-party collaborative learning without leaking client\ndata. In this paper, we focus on a special cross-domain scenario in which the\nserver has large-scale labeled data and multiple clients only have a small\namount of labeled data; meanwhile, there exist differences in data\ndistributions among the clients. In this case, traditional federated learning\nmethods can't help a client learn both the global knowledge of all participants\nand its own unique knowledge. To make up for this limitation, we propose a\ncross-domain federated object detection framework, named FedOD. The proposed\nframework first performs the federated training to obtain a public global\naggregated model through multi-teacher distillation, and sends the aggregated\nmodel back to each client for fine-tuning its personalized local model. After a\nfew rounds of communication, on each client we can perform weighted ensemble\ninference on the public global model and the personalized local model. We\nestablish a federated object detection dataset which has significant background\ndifferences and instance differences based on multiple public autonomous\ndriving datasets, and then conduct extensive experiments on the dataset. The\nexperimental results validate the effectiveness of the proposed method.\n","authors":["Shangchao Su","Bin Li","Chengzhi Zhang","Mingzhao Yang","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2206.14996v2.pdf","comment":"ICME 2023"},{"id":"http://arxiv.org/abs/2308.14686v1","updated":"2023-08-28T16:21:51Z","published":"2023-08-28T16:21:51Z","title":"360-Degree Panorama Generation from Few Unregistered NFoV Images","summary":" 360$^\\circ$ panoramas are extensively utilized as environmental light sources\nin computer graphics. 
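The weighted ensemble inference step in the FedOD summary above can be pictured as a convex blend of the two models' outputs; a sketch assuming, for illustration only, classification-style logits and a scalar mixing weight alpha (the actual per-box weighting is not specified here):

import torch

@torch.no_grad()
def ensemble_predict(global_model, local_model, images, alpha=0.5):
    # Blend the public aggregated model with the client's fine-tuned model.
    # alpha is a hypothetical mixing weight; FedOD's scheme may differ.
    logits = alpha * global_model(images) + (1 - alpha) * local_model(images)
    return logits.argmax(dim=-1)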
However, capturing a 360$^\circ$ $\times$ 180$^\circ$\npanorama poses challenges due to the necessity of specialized and costly\nequipment, and additional human resources. Prior studies develop various\nlearning-based generative methods to synthesize panoramas from a single Narrow\nField-of-View (NFoV) image, but they are limited in alterable input patterns,\ngeneration quality, and controllability. To address these issues, we propose a\nnovel pipeline called PanoDiff, which efficiently generates complete\n360$^\circ$ panoramas using one or more unregistered NFoV images captured from\narbitrary angles. Our approach has two primary components to overcome the\nlimitations. Firstly, a two-stage angle prediction module handles various\nnumbers of NFoV inputs. Secondly, a novel latent diffusion-based panorama\ngeneration model uses the incomplete panorama and text prompts as control signals\nand utilizes several geometric augmentation schemes to ensure geometric\nproperties in generated panoramas. Experiments show that PanoDiff achieves\nstate-of-the-art panoramic generation quality and high controllability, making\nit suitable for applications such as content editing.\n","authors":["Jionghao Wang","Ziyu Chen","Jun Ling","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2308.14686v1.pdf","comment":"Accepted to ACM Multimedia 2023 (MM' 23). Code is available:\n https://github.com/shanemankiw/Panodiff"},{"id":"http://arxiv.org/abs/2308.14679v1","updated":"2023-08-28T16:15:23Z","published":"2023-08-28T16:15:23Z","title":"Video-Based Hand Pose Estimation for Remote Assessment of Bradykinesia\n in Parkinson's Disease","summary":" There is a growing interest in using pose estimation algorithms for\nvideo-based assessment of Bradykinesia in Parkinson's Disease (PD) to\nfacilitate remote disease assessment and monitoring. However, the accuracy of\npose estimation algorithms in videos from video streaming services during\nTelehealth appointments has not been studied. In this study, we used seven\noff-the-shelf hand pose estimation models to estimate the movement of the thumb\nand index fingers in videos of the finger-tapping (FT) test recorded from\nHealthy Controls (HC) and participants with PD under two different\nconditions: streaming (videos recorded during a live Zoom meeting) and\non-device (videos recorded locally with high-quality cameras). The accuracy and\nreliability of the models were estimated by comparing the models' output with\nmanual results. Three of the seven models demonstrated good accuracy for\non-device recordings, and the accuracy decreased significantly for streaming\nrecordings. We observed a negative correlation between movement speed and the\nmodel's accuracy for the streaming recordings. Additionally, we evaluated the\nreliability of ten movement features related to bradykinesia extracted from\nvideo recordings of PD patients performing the FT test. While most of the\nfeatures demonstrated excellent reliability for on-device recordings, most of\nthe features demonstrated poor to moderate reliability for streaming\nrecordings. Our findings highlight the limitations of pose estimation\nalgorithms when applied to video recordings obtained during Telehealth visits,\nand demonstrate that on-device recordings can be used for automatic\nvideo-assessment of bradykinesia in PD.\n","authors":["Gabriela T. Acevedo Trebbau","Andrea Bandini","Diego L.
Guarin"],"pdf_url":"https://arxiv.org/pdf/2308.14679v1.pdf","comment":"12 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.14667v1","updated":"2023-08-28T15:54:14Z","published":"2023-08-28T15:54:14Z","title":"Neural Network-Based Histologic Remission Prediction In Ulcerative\n Colitis","summary":" BACKGROUND & AIMS: Histological remission (HR) is advocated and considered as\na new therapeutic target in ulcerative colitis (UC). Diagnosis of histologic\nremission currently relies on biopsy; during this process, patients are at risk\nfor bleeding, infection, and post-biopsy fibrosis. In addition, histologic\nresponse scoring is complex and time-consuming, and there is heterogeneity\namong pathologists. Endocytoscopy (EC) is a novel ultra-high magnification\nendoscopic technique that can provide excellent in vivo assessment of glands.\nBased on the EC technique, we propose a neural network model that can assess\nhistological disease activity in UC using EC images to address the above\nissues. The experiment results demonstrate that the proposed method can assist\npatients in precise treatment and prognostic assessment.\n METHODS: We construct a neural network model for UC evaluation. A total of\n5105 images of 154 intestinal segments from 87 patients undergoing EC treatment\nat a center in China between March 2022 and March 2023 are scored according to\nthe Geboes score. Subsequently, 103 intestinal segments are used as the\ntraining set, 16 intestinal segments are used as the validation set for neural\nnetwork training, and the remaining 35 intestinal segments are used as the test\nset to measure the model performance together with the validation set.\n RESULTS: By treating HR as a negative category and histologic activity as a\npositive category, the proposed neural network model can achieve an accuracy of\n0.9, a specificity of 0.95, a sensitivity of 0.75, and an area under the curve\n(AUC) of 0.81.\n CONCLUSION: We develop a specific neural network model that can distinguish\nhistologic remission/activity in EC images of UC, which helps to accelerate\nclinical histological diagnosis.\n keywords: ulcerative colitis; Endocytoscopy; Geboes score; neural network.\n","authors":["Yemin li","Zhongcheng Liu","Xiaoying Lou","Mirigual Kurban","Miao Li","Jie Yang","Kaiwei Che","Jiankun Wang","Max Q. -H Meng","Yan Huang","Qin Guo","Pinjin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.14667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14650v1","updated":"2023-08-28T15:22:15Z","published":"2023-08-28T15:22:15Z","title":"Comparison of automated crater catalogs for Mars from Benedix et al.\n (2020) and Lee and Hogan (2021)","summary":" Crater mapping using neural networks and other automated methods has\nincreased recently with automated Crater Detection Algorithms (CDAs) applied to\nplanetary bodies throughout the solar system. A recent publication by Benedix\net al. (2020) showed high performance at small scales compared to similar\nautomated CDAs but with a net positive diameter bias in many crater candidates.\nI compare the publicly available catalogs from Benedix et al. (2020) and Lee &\nHogan (2021) and show that the reported performance is sensitive to the metrics\nused to test the catalogs. I show how the more permissive comparison methods\nindicate a higher CDA performance by allowing worse candidate craters to match\nground-truth craters. I show that the Benedix et al. 
(2020) catalog has a\nsubstantial performance loss with increasing latitude and identify an image\nprojection issue that might cause this loss. Finally, I suggest future\napplications of neural networks in generating large scientific datasets be\nvalidated using secondary networks with independent data sources or training\nmethods.\n","authors":["Christopher Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14650v1.pdf","comment":"14 pages, 6 figures. Accepted August 13th 2023"},{"id":"http://arxiv.org/abs/2212.09950v3","updated":"2023-08-28T15:09:46Z","published":"2022-12-20T01:59:27Z","title":"Domain Generalization with Correlated Style Uncertainty","summary":" Domain generalization (DG) approaches intend to extract domain invariant\nfeatures that can lead to a more robust deep learning model. In this regard,\nstyle augmentation is a strong DG method taking advantage of instance-specific\nfeature statistics, which contain informative style characteristics, to synthesize\nnovel domains. While it is one of the state-of-the-art methods, prior works on\nstyle augmentation have either disregarded the interdependence amongst distinct\nfeature channels or have solely constrained style augmentation to linear\ninterpolation. To address these research gaps, in this work, we introduce a\nnovel augmentation approach, named Correlated Style Uncertainty (CSU),\nsurpassing the limitations of linear interpolation in style statistic space and\nsimultaneously preserving vital correlation information. Our method's efficacy\nis established through extensive experimentation on diverse cross-domain\ncomputer vision and medical imaging classification tasks: PACS, Office-Home,\nand Camelyon17 datasets, and the Duke-Market1501 instance retrieval task. The\nresults showcase a remarkable improvement margin over existing state-of-the-art\ntechniques. The source code is available at https://github.com/freshman97/CSU.\n","authors":["Zheyuan Zhang","Bin Wang","Debesh Jha","Ugur Demir","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2212.09950v3.pdf","comment":"Accepted by WACV2024, camera ready version"},{"id":"http://arxiv.org/abs/2308.14626v1","updated":"2023-08-28T14:48:49Z","published":"2023-08-28T14:48:49Z","title":"VesselShot: Few-shot learning for cerebral blood vessel segmentation","summary":" Angiography is widely used to detect, diagnose, and treat cerebrovascular\ndiseases. While numerous techniques have been proposed to segment the vascular\nnetwork from different imaging modalities, deep learning (DL) has emerged as a\npromising approach. However, existing DL methods often depend on proprietary\ndatasets and extensive manual annotation. Moreover, the availability of\npre-trained networks specifically for medical domains and 3D volumes is\nlimited. To overcome these challenges, we propose a few-shot learning approach\ncalled VesselShot for cerebrovascular segmentation. VesselShot leverages\nknowledge from a few annotated support images and mitigates the scarcity of\nlabeled data and the need for extensive annotation in cerebral blood vessel\nsegmentation.
We evaluated the performance of VesselShot using the publicly\navailable TubeTK dataset for the segmentation task, achieving a mean Dice\ncoefficient (DC) of 0.62(0.03).\n","authors":["Mumu Aktar","Hassan Rivaz","Marta Kersten-Oertel","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14619v1","updated":"2023-08-28T14:43:36Z","published":"2023-08-28T14:43:36Z","title":"Compositional Semantic Mix for Domain Adaptation in Point Cloud\n Segmentation","summary":" Deep-learning models for 3D point cloud semantic segmentation exhibit limited\ngeneralization capabilities when trained and tested on data captured with\ndifferent sensors or in varying environments due to domain shift. Domain\nadaptation methods can be employed to mitigate this domain shift, for instance,\nby simulating sensor noise, developing domain-agnostic generators, or training\npoint cloud completion networks. Often, these methods are tailored for range\nview maps or necessitate multi-modal input. In contrast, domain adaptation in\nthe image domain can be executed through sample mixing, which emphasizes input\ndata manipulation rather than employing distinct adaptation modules. In this\nstudy, we introduce compositional semantic mixing for point cloud domain\nadaptation, representing the first unsupervised domain adaptation technique for\npoint cloud segmentation based on semantic and geometric sample mixing. We\npresent a two-branch symmetric network architecture capable of concurrently\nprocessing point clouds from a source domain (e.g. synthetic) and point clouds\nfrom a target domain (e.g. real-world). Each branch operates within one domain\nby integrating selected data fragments from the other domain and utilizing\nsemantic information derived from source labels and target (pseudo) labels.\nAdditionally, our method can leverage a limited number of human point-level\nannotations (semi-supervised) to further enhance performance. We assess our\napproach in both synthetic-to-real and real-to-real scenarios using LiDAR\ndatasets and demonstrate that it significantly outperforms state-of-the-art\nmethods in both unsupervised and semi-supervised settings.\n","authors":["Cristiano Saltori","Fabio Galasso","Giuseppe Fiameni","Nicu Sebe","Fabio Poiesi","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2308.14619v1.pdf","comment":"TPAMI. arXiv admin note: text overlap with arXiv:2207.09778"},{"id":"http://arxiv.org/abs/2308.14616v1","updated":"2023-08-28T14:35:58Z","published":"2023-08-28T14:35:58Z","title":"VoroMesh: Learning Watertight Surface Meshes with Voronoi Diagrams","summary":" In stark contrast to the case of images, finding a concise, learnable\ndiscrete representation of 3D surfaces remains a challenge. In particular,\nwhile polygon meshes are arguably the most common surface representation used\nin geometry processing, their irregular and combinatorial structure often make\nthem unsuitable for learning-based applications. In this work, we present\nVoroMesh, a novel and differentiable Voronoi-based representation of watertight\n3D shape surfaces. From a set of 3D points (called generators) and their\nassociated occupancy, we define our boundary representation through the Voronoi\ndiagram of the generators as the subset of Voronoi faces whose two associated\n(equidistant) generators are of opposite occupancy: the resulting polygon mesh\nforms a watertight approximation of the target shape's boundary. 
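The VoroMesh boundary definition above (keep exactly those Voronoi faces whose two equidistant generators have opposite occupancy) maps directly onto an off-the-shelf Voronoi routine; a sketch with scipy, where the function name and return format are illustrative:

import numpy as np
from scipy.spatial import Voronoi

def voromesh_boundary_faces(generators, occupancy):
    # generators: (n, 3) float array of 3D generator positions.
    # occupancy:  (n,) bool array, True where a generator lies inside the shape.
    vor = Voronoi(generators)
    pairs = vor.ridge_points  # (m, 2) indices of the two generators per face
    keep = occupancy[pairs[:, 0]] != occupancy[pairs[:, 1]]
    faces = [vor.ridge_vertices[i] for i in np.flatnonzero(keep)]
    return pairs[keep], faces  # faces forming the watertight approximation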
To learn the\nposition of the generators, we propose a novel loss function, dubbed VoroLoss,\nthat minimizes the distance from ground truth surface samples to the closest\nfaces of the Voronoi diagram, without requiring an explicit construction of\nthe entire Voronoi diagram. A direct optimization of the VoroLoss to obtain\ngenerators on the Thingi32 dataset demonstrates the geometric efficiency of our\nrepresentation compared to axiomatic meshing algorithms and recent\nlearning-based mesh representations. We further use VoroMesh in a\nlearning-based mesh prediction task from input SDF grids on the ABC dataset,\nand show comparable performance to state-of-the-art methods while guaranteeing\nclosed output surfaces free of self-intersections.\n","authors":["Nissim Maruani","Roman Klokov","Maks Ovsjanikov","Pierre Alliez","Mathieu Desbrun"],"pdf_url":"https://arxiv.org/pdf/2308.14616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14613v1","updated":"2023-08-28T14:28:50Z","published":"2023-08-28T14:28:50Z","title":"MS-Net: A Multi-modal Self-supervised Network for Fine-Grained\n Classification of Aircraft in SAR Images","summary":" Synthetic aperture radar (SAR) imaging technology is commonly used to provide\n24-hour all-weather earth observation. However, it still has some drawbacks in\nSAR target classification, especially in fine-grained classification of\naircraft: aircraft in SAR images have large intra-class diversity and\ninter-class similarity; the number of effective samples is insufficient, and\nannotation is difficult. To address these issues, this article proposes a novel\nmulti-modal self-supervised network (MS-Net) for fine-grained classification of\naircraft. Firstly, in order to entirely exploit the potential of multi-modal\ninformation, a two-sided path feature extraction network (TSFE-N) is\nconstructed to enhance the image feature of the target and obtain the domain\nknowledge feature of the text mode. Secondly, a contrastive self-supervised\nlearning (CSSL) framework is employed to effectively learn useful\nlabel-independent features from unbalanced data, and a similarity perception loss\n(SPloss) is proposed to avoid network overfitting. Finally, TSFE-N is used as\nthe encoder of CSSL to obtain the classification results. Through a large\nnumber of experiments, our MS-Net can effectively reduce the difficulty of\nclassifying similar types of aircraft. Without any labels, the proposed\nalgorithm achieves an accuracy of 88.46% on a 17-type aircraft\nclassification task, which has pioneering significance in the field of\nfine-grained classification of aircraft in SAR images.\n","authors":["Bingying Yue","Jianhao Li","Hao Shi","Yupei Wang","Honghu Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.14613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14610v1","updated":"2023-08-28T14:26:15Z","published":"2023-08-28T14:26:15Z","title":"A Transformer-Conditioned Neural Fields Pipeline with Polar Coordinate\n Representation for Astronomical Radio Interferometric Data Reconstruction","summary":" In radio astronomy, visibility data, which are measurements of wave signals\nfrom radio telescopes, are transformed into images for observation of distant\ncelestial objects. However, these resultant images usually contain both real\nsources and artifacts, due to signal sparsity and other factors. One way to\nobtain cleaner images is to reconstruct samples into dense forms before\nimaging.
Unfortunately, existing visibility reconstruction methods may miss\nsome components of the frequency data, so blurred object edges and persistent\nartifacts remain in the images. Furthermore, the computation overhead is high\non irregular visibility samples due to the data skew. To address these\nproblems, we propose PolarRec, a reconstruction method for interferometric\nvisibility data, which consists of a transformer-conditioned neural fields\npipeline with a polar coordinate representation. This representation matches\nthe way in which telescopes observe a celestial area as the Earth rotates. We\nfurther propose a Radial Frequency Loss function, using radial coordinates in the\npolar coordinate system to correlate with the frequency information, to help\nreconstruct complete visibility. We also group visibility sample points by\nangular coordinates in the polar coordinate system, and use groups as the\ngranularity for subsequent encoding with a Transformer encoder. Consequently,\nour method can capture the inherent characteristics of visibility data\neffectively and efficiently. Our experiments demonstrate that PolarRec markedly\nimproves imaging results by faithfully reconstructing all frequency components\nin the visibility domain while significantly reducing the computation cost.\n","authors":["Ruoqi Wang","Qiong Luo","Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14605v1","updated":"2023-08-28T14:19:13Z","published":"2023-08-28T14:19:13Z","title":"A Generalization of Continuous Relaxation in Structured Pruning","summary":" Deep learning harnesses massive parallel floating-point processing to train\nand evaluate large neural networks. Trends indicate that deeper and larger\nneural networks with an increasing number of parameters achieve higher accuracy\nthan smaller neural networks. This performance improvement, which often\nrequires heavy compute for both training and evaluation, eventually needs to\ntranslate well to resource-constrained hardware for practical value. Structured\npruning asserts that while large networks enable us to find solutions to\ncomplex computer vision problems, a smaller, computationally efficient\nsub-network can be derived from the large neural network that retains model\naccuracy but significantly improves computational efficiency.\n We generalize structured pruning with algorithms for network augmentation,\npruning, sub-network collapse and removal. In addition, we demonstrate\nefficient and stable convergence up to 93% sparsity and 95% FLOPs reduction\nwithout loss of inference accuracy using continuous relaxation, matching or\nexceeding the state of the art for all structured pruning methods. The\nresulting CNN executes efficiently on GPU hardware without computationally\nexpensive sparse matrix operations.
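The polar-coordinate grouping that the PolarRec summary above feeds to its Transformer encoder amounts to bucketing (u, v) sample points by angle; a sketch in which the bucket count is an arbitrary illustrative choice:

import numpy as np

def polar_groups(u, v, num_groups=64):
    # Convert visibility sample coordinates to polar form.
    radius = np.hypot(u, v)                       # radial (frequency) coordinate
    angle = np.mod(np.arctan2(v, u), 2 * np.pi)   # angular coordinate
    group = (angle / (2 * np.pi) * num_groups).astype(int) % num_groups
    return radius, angle, group                   # group indexes the encoder buckets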
We achieve this with routine automatable\noperations on classification and segmentation problems using the CIFAR-10,\nImageNet, and CityScapes datasets with the ResNet and U-NET network\narchitectures.\n","authors":["Brad Larson","Bishal Upadhyaya","Luke McDermott","Siddha Ganju"],"pdf_url":"https://arxiv.org/pdf/2308.14605v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2305.06310v3","updated":"2023-08-28T14:18:25Z","published":"2023-04-27T03:41:15Z","title":"SoGAR: Self-supervised Spatiotemporal Attention-based Social Group\n Activity Recognition","summary":" This paper introduces a novel approach to Social Group Activity Recognition\n(SoGAR) using a self-supervised Transformer network that can effectively utilize\nunlabeled video data. To extract spatio-temporal information, we create local\nand global views with varying frame rates. Our self-supervised objective\nensures that features extracted from contrasting views of the same video are\nconsistent across spatio-temporal domains. Our proposed approach is efficient\nin using transformer-based encoders to alleviate the weakly supervised setting\nof group activity recognition. By leveraging the benefits of transformer\nmodels, our approach can model long-term relationships along spatio-temporal\ndimensions. Our proposed SoGAR method achieved state-of-the-art results on\nthree group activity recognition benchmarks, namely the JRDB-PAR, NBA, and\nVolleyball datasets, surpassing the current numbers in terms of F1-score, MCA,\nand MPCA metrics.\n","authors":["Naga VS Raviteja Chappa","Pha Nguyen","Alexander H Nelson","Han-Seok Seo","Xin Li","Page Daniel Dobbs","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2305.06310v3.pdf","comment":"Under review for PR journal; 32 pages, 7 figures. arXiv admin note:\n text overlap with arXiv:2303.12149"},{"id":"http://arxiv.org/abs/2308.14604v1","updated":"2023-08-28T14:17:16Z","published":"2023-08-28T14:17:16Z","title":"SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space\n Reconstruction","summary":" Segment Anything Model (SAM) has received remarkable attention as it offers a\npowerful and versatile solution for object segmentation in images. However,\nfine-tuning SAM for downstream segmentation tasks under different scenarios\nremains a challenge, as the varied characteristics of different scenarios\nnaturally require diverse model parameter spaces. Most existing fine-tuning\nmethods attempt to bridge the gaps among different scenarios by introducing a\nset of new parameters to modify SAM's original parameter space. Unlike these\nworks, in this paper, we propose fine-tuning SAM efficiently by parameter space\nreconstruction (SAM-PARSER), which introduces nearly zero trainable parameters\nduring fine-tuning. In SAM-PARSER, we assume that SAM's original parameter\nspace is relatively complete, so that its bases are able to reconstruct the\nparameter space of a new scenario. We obtain the bases by matrix decomposition,\nand fine-tune the coefficients to reconstruct the parameter space tailored to\nthe new scenario by an optimal linear combination of the bases.
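One plausible reading of the SAM-PARSER reconstruction described above, taking SVD as the matrix decomposition and a per-singular-value coefficient vector as the only trainable part (both concrete choices are assumptions):

import torch

class SVDReparam(torch.nn.Module):
    # Freeze a weight matrix's singular bases; learn only scaling coefficients.
    def __init__(self, weight):
        super().__init__()
        u, s, vh = torch.linalg.svd(weight, full_matrices=False)
        self.register_buffer("u", u)
        self.register_buffer("s", s)
        self.register_buffer("vh", vh)
        self.coeff = torch.nn.Parameter(torch.ones_like(s))  # tiny trainable set

    def forward(self):
        # Linear combination of the frozen bases tailored to the new scenario.
        return (self.u * (self.s * self.coeff)) @ self.vh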
Experimental\nresults show that SAM-PARSER exhibits superior segmentation performance across\nvarious scenarios, while reducing the number of trainable parameters by\n$\\approx 290$ times compared with current parameter-efficient fine-tuning\nmethods.\n","authors":["Zelin Peng","Zhengqin Xu","Zhilin Zeng","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12149v4","updated":"2023-08-28T14:13:16Z","published":"2023-03-06T16:58:27Z","title":"SPARTAN: Self-supervised Spatiotemporal Transformers Approach to Group\n Activity Recognition","summary":" In this paper, we propose a new, simple, and effective Self-supervised\nSpatio-temporal Transformers (SPARTAN) approach to Group Activity Recognition\n(GAR) using unlabeled video data. Given a video, we create local and global\nSpatio-temporal views with varying spatial patch sizes and frame rates. The\nproposed self-supervised objective aims to match the features of these\ncontrasting views representing the same video to be consistent with the\nvariations in spatiotemporal domains. To the best of our knowledge, the\nproposed mechanism is one of the first works to alleviate the weakly supervised\nsetting of GAR using the encoders in video transformers. Furthermore, using the\nadvantage of transformer models, our proposed approach supports long-term\nrelationship modeling along spatio-temporal dimensions. The proposed SPARTAN\napproach performs well on two group activity recognition benchmarks, including\nNBA and Volleyball datasets, by surpassing the state-of-the-art results by a\nsignificant margin in terms of MCA and MPCA metrics.\n","authors":["Naga VS Raviteja Chappa","Pha Nguyen","Alexander H Nelson","Han-Seok Seo","Xin Li","Page Daniel Dobbs","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2303.12149v4.pdf","comment":"Accepted to CVPRW 2023; 11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14598v1","updated":"2023-08-28T14:09:03Z","published":"2023-08-28T14:09:03Z","title":"S-TREK: Sequential Translation and Rotation Equivariant Keypoints for\n local feature extraction","summary":" In this work we introduce S-TREK, a novel local feature extractor that\ncombines a deep keypoint detector, which is both translation and rotation\nequivariant by design, with a lightweight deep descriptor extractor. We train\nthe S-TREK keypoint detector within a framework inspired by reinforcement\nlearning, where we leverage a sequential procedure to maximize a reward\ndirectly related to keypoint repeatability. Our descriptor network is trained\nfollowing a \"detect, then describe\" approach, where the descriptor loss is\nevaluated only at those locations where keypoints have been selected by the\nalready trained detector. 
Extensive experiments on multiple benchmarks confirm\nthe effectiveness of our proposed method, with S-TREK often outperforming other\nstate-of-the-art methods in terms of repeatability and quality of the recovered\nposes, especially when dealing with in-plane rotations.\n","authors":["Emanuele Santellani","Christian Sormann","Mattia Rossi","Andreas Kuhn","Friedrich Fraundorfer"],"pdf_url":"https://arxiv.org/pdf/2308.14598v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14597v1","updated":"2023-08-28T14:09:02Z","published":"2023-08-28T14:09:02Z","title":"Adversarial Attacks on Foundational Vision Models","summary":" Rapid progress is being made in developing large, pretrained, task-agnostic\nfoundational vision models such as CLIP, ALIGN, DINOv2, etc. In fact, we are\napproaching the point where these models do not have to be finetuned\ndownstream, and can simply be used in zero-shot or with a lightweight probing\nhead. Critically, given the complexity of working at this scale, there is a\nbottleneck where relatively few organizations in the world are executing the\ntraining then sharing the models on centralized platforms such as HuggingFace\nand torch.hub. The goal of this work is to identify several key adversarial\nvulnerabilities of these models in an effort to make future designs more\nrobust. Intuitively, our attacks manipulate deep feature representations to\nfool an out-of-distribution (OOD) detector which will be required when using\nthese open-world-aware models to solve closed-set downstream tasks. Our methods\nreliably make in-distribution (ID) images (w.r.t. a downstream task) be\npredicted as OOD and vice versa while existing in extremely\nlow-knowledge-assumption threat models. We show our attacks to be potent in\nwhitebox and blackbox settings, as well as when transferred across foundational\nmodel types (e.g., attack DINOv2 with CLIP)! This work is only just the\nbeginning of a long journey towards adversarially robust foundational vision\nmodels.\n","authors":["Nathan Inkawhich","Gwendolyn McDonald","Ryan Luley"],"pdf_url":"https://arxiv.org/pdf/2308.14597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14596v1","updated":"2023-08-28T14:08:42Z","published":"2023-08-28T14:08:42Z","title":"LatentDR: Improving Model Generalization Through Sample-Aware Latent\n Degradation and Restoration","summary":" Despite significant advances in deep learning, models often struggle to\ngeneralize well to new, unseen domains, especially when training data is\nlimited. To address this challenge, we propose a novel approach for\ndistribution-aware latent augmentation that leverages the relationships across\nsamples to guide the augmentation procedure. Our approach first degrades the\nsamples stochastically in the latent space, mapping them to augmented labels,\nand then restores the samples from their corrupted versions during training.\nThis process confuses the classifier in the degradation step and restores the\noverall class distribution of the original samples, promoting diverse\nintra-class/cross-domain variability. We extensively evaluate our approach on a\ndiverse set of datasets and tasks, including domain generalization benchmarks\nand medical imaging datasets with strong domain shift, where we show our\napproach achieves significant improvements over existing methods for latent\nspace augmentation. 
We further show that our method can be flexibly adapted to\nlong-tail recognition tasks, demonstrating its versatility in building more\ngeneralizable models. Code is available at\nhttps://github.com/nerdslab/LatentDR.\n","authors":["Ran Liu","Sahil Khose","Jingyun Xiao","Lakshmi Sathidevi","Keerthan Ramnath","Zsolt Kira","Eva L. Dyer"],"pdf_url":"https://arxiv.org/pdf/2308.14596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14595v1","updated":"2023-08-28T14:06:36Z","published":"2023-08-28T14:06:36Z","title":"Neural Network Training Strategy to Enhance Anomaly Detection\n Performance: A Perspective on Reconstruction Loss Amplification","summary":" Unsupervised anomaly detection (UAD) is a widely adopted approach in industry\ndue to rare anomaly occurrences and data imbalance. A desirable characteristic\nof a UAD model is a contained generalization ability, which excels in the\nreconstruction of seen normal patterns but struggles with unseen anomalies.\nRecent studies have sought to contain the generalization capability of their\nUAD models in reconstruction from different perspectives, such as the design of the\nneural network (NN) structure and the training strategy. In contrast, we note that\ncontainment of generalization ability in reconstruction can also be obtained\nsimply from a steep-shaped loss landscape. Motivated by this, we propose a loss\nlandscape sharpening method by amplifying the reconstruction loss, dubbed Loss\nAMPlification (LAMP). LAMP deforms the loss landscape into a steep shape so that the\nreconstruction error on unseen anomalies becomes greater. Accordingly, the\nanomaly detection performance is improved without any change to the NN\narchitecture. Our findings suggest that LAMP can be easily applied to any\nreconstruction error metric in UAD settings where the reconstruction model is\ntrained with anomaly-free samples only.\n","authors":["YeongHyeon Park","Sungho Kang","Myung Jin Kim","Hyeonho Jeong","Hyunkyu Park","Hyeong Seok Kim","Juneho Yi"],"pdf_url":"https://arxiv.org/pdf/2308.14595v1.pdf","comment":"5 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2211.16098v4","updated":"2023-08-28T14:03:09Z","published":"2022-11-29T11:17:34Z","title":"Three-stage binarization of color document images based on discrete\n wavelet transform and generative adversarial networks","summary":" The efficient segmentation of foreground text information from the background\nin degraded color document images is a critical challenge in the preservation\nof ancient manuscripts. The imperfect preservation of ancient manuscripts over\ntime has led to various types of degradation, such as staining, yellowing, and\nink seepage, significantly affecting image binarization results. This work\nproposes a three-stage method using Generative Adversarial Networks (GAN) for\nenhancing and binarizing degraded color document images through Discrete\nWavelet Transform (DWT). Stage-1 involves applying DWT and retaining the\nLow-Low (LL) subband images for image enhancement. In Stage-2, the original\ninput image is divided into four single-channel images (Red, Green, Blue, and\nGray), and each is trained with independent adversarial networks to extract\ncolor foreground information.
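A toy rendering of the amplification idea in the LAMP summary above: wrap any reconstruction error in a convex, monotone map so the landscape steepens and unseen anomalies incur disproportionately large errors. The expm1 choice below is a guess at the general shape, not the paper's exact function:

import torch

def lamp_style_loss(recon, target):
    mse = torch.mean((recon - target) ** 2)  # any reconstruction metric works here
    return torch.expm1(mse)                  # convex amplification steepens the landscape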
In Stage-3, the output image from Stage-2 and the\noriginal input image are used to train independent adversarial networks for\ndocument binarization, enabling the integration of global and local features.\nThe experimental results demonstrate that our proposed method outperforms other\nclassic and state-of-the-art (SOTA) methods on the Document Image Binarization\nContest (DIBCO) datasets. We have released our implementation code at\nhttps://github.com/abcpp12383/ThreeStageBinarization.\n","authors":["Yu-Shian Lin","Rui-Yang Ju","Chih-Chia Chen","Chun-Tse Chien","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2211.16098v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14583v1","updated":"2023-08-28T13:49:08Z","published":"2023-08-28T13:49:08Z","title":"Learning to Read Analog Gauges from Synthetic Data","summary":" Manually reading and logging gauge data is time-inefficient, and the effort\nincreases according to the number of gauges available. We present a computer\nvision pipeline that automates the reading of analog gauges. We propose a\ntwo-stage CNN pipeline that identifies the key structural components of an\nanalog gauge and outputs an angular reading. To facilitate the training of our\napproach, a synthetic dataset is generated, thus obtaining a set of realistic\nanalog gauges with their corresponding annotations. To validate our proposal, an\nadditional real-world dataset was collected with 4,813 manually curated images.\nWhen compared against state-of-the-art methodologies, our method shows a\nsignificant improvement of 4.55 in the average error, which is a 52% relative\nimprovement. The resources for this project will be made available at:\nhttps://github.com/fuankarion/automatic-gauge-reading.\n","authors":["Juan Leon-Alcazar","Yazeed Alnumay","Cheng Zheng","Hassane Trigui","Sahejad Patel","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2308.14583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14575v1","updated":"2023-08-28T13:40:47Z","published":"2023-08-28T13:40:47Z","title":"Referring Image Segmentation Using Text Supervision","summary":" Existing Referring Image Segmentation (RIS) methods typically require\nexpensive pixel-level or box-level annotations for supervision. In this paper,\nwe observe that the referring texts used in RIS already provide sufficient\ninformation to localize the target object. Hence, we propose a novel\nweakly-supervised RIS framework to formulate the target localization problem as\na classification process to differentiate between positive and negative text\nexpressions. While the referring text expressions for an image are used as\npositive expressions, the referring text expressions from other images can be\nused as negative expressions for this image. Our framework has three main\nnovelties. First, we propose a bilateral prompt method to facilitate the\nclassification process, by harmonizing the domain discrepancy between visual\nand linguistic features. Second, we propose a calibration method to reduce\nnoisy background information and improve the correctness of the response maps\nfor target object localization. Third, we propose a positive response map\nselection strategy to generate high-quality pseudo-labels from the enhanced\nresponse maps, for training a segmentation network for RIS inference.
For\nevaluation, we propose a new metric to measure localization accuracy.\nExperiments on four benchmarks show that our framework achieves promising\nperformance compared to existing fully-supervised RIS methods while outperforming\nstate-of-the-art weakly-supervised methods adapted from related areas. Code is\navailable at https://github.com/fawnliu/TRIS.\n","authors":["Fang Liu","Yuhao Liu","Yuqiu Kong","Ke Xu","Lihe Zhang","Baocai Yin","Gerhard Hancke","Rynson Lau"],"pdf_url":"https://arxiv.org/pdf/2308.14575v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14570v1","updated":"2023-08-28T13:35:07Z","published":"2023-08-28T13:35:07Z","title":"SAAN: Similarity-aware attention flow network for change detection with\n VHR remote sensing images","summary":" Change detection (CD) is a fundamental and important task for monitoring the\nland surface dynamics in the earth observation field. Existing deep\nlearning-based CD methods typically extract bi-temporal image features using a\nweight-sharing Siamese encoder network and identify change regions using a\ndecoder network. These CD methods, however, still perform far from\nsatisfactorily as we observe that 1) deep encoder layers focus on irrelevant\nbackground regions and 2) the models' confidence in the change regions is\ninconsistent at different decoder stages. The first problem is because deep\nencoder layers cannot effectively learn from imbalanced change categories using\nthe sole output supervision, while the second problem is attributed to the lack\nof explicit semantic consistency preservation. To address these issues, we\ndesign a novel similarity-aware attention flow network (SAAN). SAAN\nincorporates a similarity-guided attention flow module with deeply supervised\nsimilarity optimization to achieve effective change detection. Specifically, we\ncounter the first issue by explicitly guiding deep encoder layers to discover\nsemantic relations from bi-temporal input images using deeply supervised\nsimilarity optimization. The extracted features are optimized to be\nsemantically similar in the unchanged regions and dissimilar in the changing\nregions. The second drawback can be alleviated by the proposed\nsimilarity-guided attention flow module, which incorporates similarity-guided\nattention modules and attention flow mechanisms to guide the model to focus on\ndiscriminative channels and regions. We evaluated the effectiveness and\ngeneralization ability of the proposed method by conducting experiments on a\nwide range of CD tasks. The experimental results demonstrate that our method\nachieves excellent performance on several CD tasks, with discriminative\nfeatures and semantic consistency preserved.\n","authors":["Haonan Guo","Xin Su","Chen Wu","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14570v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2304.11609v3","updated":"2023-08-28T13:26:52Z","published":"2023-04-23T10:46:16Z","title":"PiClick: Picking the desired mask in click-based interactive\n segmentation","summary":" Click-based interactive segmentation aims to generate target masks via human\nclicking, which facilitates efficient pixel-level annotation and image editing.\nIn such a task, target ambiguity remains a problem hindering the accuracy and\nefficiency of segmentation.
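The deeply supervised similarity optimization in the SAAN summary above can be written as a per-pixel cosine objective on the bi-temporal features; a sketch in which the margin formulation is an illustrative assumption:

import torch
import torch.nn.functional as F

def similarity_loss(feat_t1, feat_t2, change_mask, margin=0.5):
    # feat_t1, feat_t2: (B, C, H, W) bi-temporal feature maps.
    # change_mask:      (B, 1, H, W), 1 where ground truth marks change.
    sim = F.cosine_similarity(feat_t1, feat_t2, dim=1).unsqueeze(1)
    pull = (1 - change_mask) * (1 - sim)        # unchanged: force similarity
    push = change_mask * F.relu(sim - margin)   # changed: force dissimilarity
    return (pull + push).mean()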
That is, in scenes with rich context, one click may\ncorrespond to multiple potential targets, while most previous interactive\nsegmentors only generate a single mask and fail to deal with target ambiguity.\nIn this paper, we propose a novel interactive segmentation network named\nPiClick, to yield all potentially reasonable masks and suggest the most\nplausible one for the user. Specifically, PiClick utilizes a Transformer-based\narchitecture to generate all potential target masks by mutually interactive\nmask queries. Moreover, a Target Reasoning module is designed in PiClick to\nautomatically suggest the user-desired mask from all candidates, relieving\ntarget ambiguity and extra human effort. Extensive experiments on 9\ninteractive segmentation datasets demonstrate that PiClick performs favorably\nagainst previous state-of-the-art methods in terms of segmentation results.\nMoreover, we show that PiClick effectively reduces human effort in annotating\nand picking the desired masks. To ease the usage and inspire future research,\nwe release the source code of PiClick together with a plug-and-play annotation\ntool at https://github.com/cilinyan/PiClick.\n","authors":["Cilin Yan","Haochen Wang","Jie Liu","Xiaolong Jiang","Yao Hu","Xu Tang","Guoliang Kang","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2304.11609v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.14551v1","updated":"2023-08-28T13:11:05Z","published":"2023-08-28T13:11:05Z","title":"Face Presentation Attack Detection by Excavating Causal Clues and\n Adapting Embedding Statistics","summary":" Recent face presentation attack detection (PAD) leverages domain adaptation\n(DA) and domain generalization (DG) techniques to address performance\ndegradation on unknown domains. However, DA-based PAD methods require access to\nunlabeled target data, while most DG-based PAD solutions rely on a priori,\ni.e., known domain labels. Moreover, most DA-/DG-based methods are\ncomputationally intensive, demanding complex model architectures and/or\nmulti-stage training processes. This paper proposes to model face PAD as a\ncompound DG task from a causal perspective, linking it to model optimization.\nWe excavate the causal factors hidden in the high-level representation via\ncounterfactual intervention. Moreover, we introduce a class-guided MixStyle to\nenrich feature-level data distribution within classes instead of focusing on\ndomain information. Both the class-guided MixStyle and counterfactual intervention\ncomponents introduce no extra trainable parameters and only negligible computational\noverhead. Extensive cross-dataset and analytic experiments demonstrate the\neffectiveness and efficiency of our method compared to state-of-the-art PADs.\nThe implementation and the trained weights are publicly available.\n","authors":["Meiling Fang","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2308.14551v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2205.05249v2","updated":"2023-08-28T13:00:38Z","published":"2022-05-11T03:36:04Z","title":"Secure & Private Federated Neuroimaging","summary":" The amount of biomedical data continues to grow rapidly. However, collecting\ndata from multiple sites for joint analysis remains challenging due to\nsecurity, privacy, and regulatory concerns.
To overcome this challenge, we use\nFederated Learning, which enables distributed training of neural network models\nover multiple data sources without sharing data. Each site trains the neural\nnetwork over its private data for some time, then shares the neural network\nparameters (i.e., weights, gradients) with a Federation Controller, which in\nturn aggregates the local models, sends the resulting community model back to\neach site, and the process repeats. Our Federated Learning architecture,\nMetisFL, provides strong security and privacy. First, sample data never leaves\na site. Second, neural network parameters are encrypted before transmission and\nthe global neural model is computed under fully-homomorphic encryption.\nFinally, we use information-theoretic methods to limit information leakage from\nthe neural model to prevent a curious site from performing model inversion or\nmembership attacks. We present a thorough evaluation of the performance of\nsecure, private federated learning in neuroimaging tasks, including for\npredicting Alzheimer's disease and estimating BrainAGE from magnetic resonance\nimaging (MRI) studies, in challenging, heterogeneous federated environments\nwhere sites have different amounts of data and statistical distributions.\n","authors":["Dimitris Stripelis","Umang Gupta","Hamza Saleem","Nikhil Dhinagar","Tanmay Ghai","Rafael Chrysovalantis Anastasiou","Armaghan Asghar","Greg Ver Steeg","Srivatsan Ravi","Muhammad Naveed","Paul M. Thompson","Jose Luis Ambite"],"pdf_url":"https://arxiv.org/pdf/2205.05249v2.pdf","comment":"18 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2212.06969v2","updated":"2023-08-28T12:51:20Z","published":"2022-12-14T01:28:12Z","title":"EgoLoc: Revisiting 3D Object Localization from Egocentric Videos with\n Visual Queries","summary":" With the recent advances in video and 3D understanding, novel 4D\nspatio-temporal methods fusing both concepts have emerged. Towards this\ndirection, the Ego4D Episodic Memory Benchmark proposed a task for Visual\nQueries with 3D Localization (VQ3D). Given an egocentric video clip and an\nimage crop depicting a query object, the goal is to localize the 3D position of\nthe center of that query object with respect to the camera pose of a query\nframe. Current methods tackle the problem of VQ3D by unprojecting the 2D\nlocalization results of the sibling task Visual Queries with 2D Localization\n(VQ2D) into 3D predictions. Yet, we point out that the low number of camera\nposes caused by camera re-localization from previous VQ3D methods severely\nhinders their overall success rate. In this work, we formalize a pipeline (we\ndub EgoLoc) that better entangles 3D multiview geometry with 2D object\nretrieval from egocentric videos. Our approach involves estimating more robust\ncamera poses and aggregating multi-view 3D displacements by leveraging the 2D\ndetection confidence, which enhances the success rate of object queries and\nleads to a significant improvement in the VQ3D baseline performance.\nSpecifically, our approach achieves an overall success rate of up to 87.12%,\nwhich sets a new state-of-the-art result in the VQ3D task. We provide a\ncomprehensive empirical analysis of the VQ3D task and existing solutions, and\nhighlight the remaining challenges in VQ3D.
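Stripped of the encryption and leakage controls, the train-share-aggregate loop in the Secure & Private Federated Neuroimaging summary above reduces to plain federated averaging; a sketch where the site interface and single-pass schedule are assumptions:

import copy
import torch
import torch.nn.functional as F

def federated_round(global_model, site_loaders, lr=1e-3):
    # One communication round: each site trains privately, the controller averages.
    local_states = []
    for loader in site_loaders:
        model = copy.deepcopy(global_model)       # sample data never leaves the site
        opt = torch.optim.SGD(model.parameters(), lr=lr)
        for x, y in loader:
            opt.zero_grad()
            F.cross_entropy(model(x), y).backward()
            opt.step()
        local_states.append(model.state_dict())   # only parameters are shared
    avg = {k: torch.stack([s[k].float() for s in local_states]).mean(0)
           for k in local_states[0]}              # the community model
    global_model.load_state_dict(avg)
    return global_model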
The code is available at\nhttps://github.com/Wayne-Mai/EgoLoc.\n","authors":["Jinjie Mai","Abdullah Hamdi","Silvio Giancola","Chen Zhao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2212.06969v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14525v1","updated":"2023-08-28T12:23:36Z","published":"2023-08-28T12:23:36Z","title":"Semi-Supervised Learning for Visual Bird's Eye View Semantic\n Segmentation","summary":" Visual bird's eye view (BEV) semantic segmentation helps autonomous vehicles\nunderstand the surrounding environment only from images, including static\nelements (e.g., roads) and dynamic elements (e.g., vehicles, pedestrians).\nHowever, the high cost of annotation procedures of full-supervised methods\nlimits the capability of the visual BEV semantic segmentation, which usually\nneeds HD maps, 3D object bounding boxes, and camera extrinsic matrixes. In this\npaper, we present a novel semi-supervised framework for visual BEV semantic\nsegmentation to boost performance by exploiting unlabeled images during the\ntraining. A consistency loss that makes full use of unlabeled data is then\nproposed to constrain the model on not only semantic prediction but also the\nBEV feature. Furthermore, we propose a novel and effective data augmentation\nmethod named conjoint rotation which reasonably augments the dataset while\nmaintaining the geometric relationship between the front-view images and the\nBEV semantic segmentation. Extensive experiments on the nuScenes and Argoverse\ndatasets show that our semi-supervised framework can effectively improve\nprediction accuracy. To the best of our knowledge, this is the first work that\nexplores improving visual BEV semantic segmentation performance using unlabeled\ndata. The code will be publicly available.\n","authors":["Junyu Zhu","Lina Liu","Yu Tang","Feng Wen","Wanlong Li","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13266v2","updated":"2023-08-28T12:02:25Z","published":"2023-08-25T09:37:51Z","title":"Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual\n Tracking and Segmentation","summary":" Tracking any given object(s) spatially and temporally is a common purpose in\nVisual Object Tracking (VOT) and Video Object Segmentation (VOS). Joint\ntracking and segmentation have been attempted in some studies but they often\nlack full compatibility of both box and mask in initialization and prediction,\nand mainly focus on single-object scenarios. To address these limitations, this\npaper proposes a Multi-object Mask-box Integrated framework for unified\nTracking and Segmentation, dubbed MITS. Firstly, the unified identification\nmodule is proposed to support both box and mask reference for initialization,\nwhere detailed object information is inferred from boxes or directly retained\nfrom masks. Additionally, a novel pinpoint box predictor is proposed for\naccurate multi-object box prediction, facilitating target-oriented\nrepresentation learning. All target objects are processed simultaneously from\nencoding to propagation and decoding, as a unified pipeline for VOT and VOS.\nExperimental results show MITS achieves state-of-the-art performance on both\nVOT and VOS benchmarks. Notably, MITS surpasses the best prior VOT competitor\nby around 6% on the GOT-10k test set, and significantly improves the\nperformance of box initialization on VOS benchmarks. 
The code is available at\nhttps://github.com/yoxu515/MITS.\n","authors":["Yuanyou Xu","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13266v2.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2211.00945v2","updated":"2023-08-28T11:36:06Z","published":"2022-11-02T08:09:03Z","title":"CarDD: A New Dataset for Vision-based Car Damage Detection","summary":" Automatic car damage detection has attracted significant attention in the car\ninsurance business. However, due to the lack of high-quality and publicly\navailable datasets, we can hardly learn a feasible model for car damage\ndetection. To this end, we contribute Car Damage Detection (CarDD), the\nfirst public large-scale dataset designed for vision-based car damage detection\nand segmentation. Our CarDD contains 4,000 high-resolution car damage images\nwith over 9,000 well-annotated instances of six damage categories. We detail\nthe image collection, selection, and annotation processes, and present a\nstatistical dataset analysis. Furthermore, we conduct extensive experiments on\nCarDD with state-of-the-art deep methods for different tasks and provide\ncomprehensive analyses to highlight the specialty of car damage detection.\nThe CarDD dataset and the source code are available at\nhttps://cardd-ustc.github.io.\n","authors":["Xinkuang Wang","Wenjing Li","Zhongcheng Wu"],"pdf_url":"https://arxiv.org/pdf/2211.00945v2.pdf","comment":"13 pages, 10 figures, full-length paper for Transactions on\n Intelligent Transportation Systems (2023)"},{"id":"http://arxiv.org/abs/2308.14500v1","updated":"2023-08-28T11:20:48Z","published":"2023-08-28T11:20:48Z","title":"LAC -- Latent Action Composition for Skeleton-based Action Segmentation","summary":" Skeleton-based action segmentation requires recognizing composable actions in\nuntrimmed videos. Current approaches decouple this problem by first extracting\nlocal visual features from skeleton sequences and then processing them by a\ntemporal model to classify frame-wise actions. However, their performance\nremains limited as the visual features cannot sufficiently express composable\nactions. In this context, we propose Latent Action Composition (LAC), a novel\nself-supervised framework aiming at learning from synthesized composable\nmotions for skeleton-based action segmentation. LAC is composed of a novel\ngeneration module towards synthesizing new sequences. Specifically, we design a\nlinear latent space in the generator to represent primitive motion. New\ncomposed motions can be synthesized by simply performing arithmetic operations\non latent representations of multiple input skeleton sequences. LAC leverages\nsuch synthesized sequences, which have large diversity and complexity, for\nlearning visual representations of skeletons in both sequence and frame spaces\nvia contrastive learning. The resulting visual encoder has a high expressive\npower and can be effectively transferred onto action segmentation tasks by\nend-to-end fine-tuning without the need for additional temporal models.
We\nconduct a study focusing on transfer learning and show that representations\nlearned from pre-trained LAC outperform the state-of-the-art by a large margin\non the TSU, Charades, and PKU-MMD datasets.\n","authors":["Di Yang","Yaohui Wang","Antitza Dantcheva","Quan Kong","Lorenzo Garattoni","Gianpiero Francesca","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2308.14500v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2302.05968v2","updated":"2023-08-28T11:14:31Z","published":"2023-02-12T18:16:51Z","title":"Self-supervised pseudo-colorizing of masked cells","summary":" Self-supervised learning, which is strikingly referred to as the dark matter\nof intelligence, is gaining more attention in biomedical applications of deep\nlearning. In this work, we introduce a novel self-supervision objective for the\nanalysis of cells in biomedical microscopy images. We propose training deep\nlearning models to pseudo-colorize masked cells. We use a physics-informed\npseudo-spectral colormap that is well suited for colorizing cell topology. Our\nexperiments reveal that approximating semantic segmentation by\npseudo-colorization is beneficial for subsequent fine-tuning on cell detection.\nInspired by the recent success of masked image modeling, we additionally mask\nout cell parts and train to reconstruct these parts to further enrich the\nlearned representations. We compare our pre-training method with\nself-supervised frameworks including contrastive learning (SimCLR), masked\nautoencoders (MAEs), and edge-based self-supervision. We build upon our\nprevious work and train hybrid models for cell detection, which contain both\nconvolutional and vision transformer modules. Our pre-training method can\noutperform SimCLR, MAE-like masked image modeling, and edge-based\nself-supervision when pre-training on a diverse set of six fluorescence\nmicroscopy datasets. Code is available at:\nhttps://github.com/roydenwa/pseudo-colorize-masked-cells\n","authors":["Royden Wagner","Carlos Fernandez Lopez","Christoph Stiller"],"pdf_url":"https://arxiv.org/pdf/2302.05968v2.pdf","comment":"14 pages, 3 figures; Published in PLOS ONE"},{"id":"http://arxiv.org/abs/2308.14492v1","updated":"2023-08-28T11:10:14Z","published":"2023-08-28T11:10:14Z","title":"PointHPS: Cascaded 3D Human Pose and Shape Estimation from Point Clouds","summary":" Human pose and shape estimation (HPS) has attracted increasing attention in\nrecent years. While most existing studies focus on HPS from 2D images or videos\nwith inherent depth ambiguity, there is a surging need to investigate HPS from\n3D point clouds as depth sensors have been frequently employed in commercial\ndevices.
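The latent arithmetic in the LAC summary above (composing new motions from multiple sequences' codes) can be pictured as a weighted sum in the generator's linear latent space; the plain convex combination below is an assumption, not LAC's confirmed operation:

import torch

def compose_motion_latents(latents, weights=None):
    # latents: list of (d,) motion codes from different skeleton sequences.
    z = torch.stack(latents)
    if weights is None:
        weights = torch.full((len(latents),), 1.0 / len(latents))
    return (weights.unsqueeze(1) * z).sum(dim=0)  # composed motion code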
However, real-world sensory 3D points are usually noisy and\nincomplete, and human bodies can exhibit highly diverse poses.\nTo tackle these challenges, we propose a principled framework, PointHPS, for\naccurate 3D HPS from point clouds captured in real-world settings, which\niteratively refines point features through a cascaded architecture.\nSpecifically, each stage of PointHPS performs a series of downsampling and\nupsampling operations to extract and collate both local and global cues, which\nare further enhanced by two novel modules: 1) Cross-stage Feature Fusion (CFF)\nfor multi-scale feature propagation that allows information to flow effectively\nthrough the stages, and 2) Intermediate Feature Enhancement (IFE) for\nbody-aware feature aggregation that improves feature quality after each stage.\nTo facilitate a comprehensive study under various scenarios, we conduct our\nexperiments on two large-scale benchmarks, comprising i) a dataset that\nfeatures diverse subjects and actions captured by real commercial sensors in a\nlaboratory environment, and ii) controlled synthetic data generated with\nrealistic considerations such as clothed humans in crowded outdoor scenes.\nExtensive experiments demonstrate that PointHPS, with its powerful point\nfeature extraction and processing scheme, outperforms state-of-the-art methods\nby significant margins across the board. Homepage:\nhttps://caizhongang.github.io/projects/PointHPS/.\n","authors":["Zhongang Cai","Liang Pan","Chen Wei","Wanqi Yin","Fangzhou Hong","Mingyuan Zhang","Chen Change Loy","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05277v2","updated":"2023-08-28T10:51:09Z","published":"2023-04-11T15:23:29Z","title":"Graph-based Topology Reasoning for Driving Scenes","summary":" Understanding the road genome is essential to realize autonomous driving.\nThis highly intelligent problem contains two aspects - the connection\nrelationship of lanes, and the assignment relationship between lanes and\ntraffic elements, where a comprehensive topology reasoning method is still\nlacking. On one hand, previous map learning techniques struggle to derive lane\nconnectivity with segmentation or laneline paradigms, while prior lane\ntopology-oriented approaches focus on centerline detection and neglect\ninteraction modeling. On the other hand, the traffic element to lane assignment\nproblem is limited in the image domain, leaving how to construct the\ncorrespondence between the two views an unexplored challenge. To address these\nissues, we present TopoNet, the first end-to-end framework capable of\nabstracting traffic knowledge beyond conventional perception tasks. To capture\nthe driving scene topology, we introduce three key designs: (1) an embedding\nmodule to incorporate semantic knowledge from 2D elements into a unified\nfeature space; (2) a curated scene graph neural network to model relationships\nand enable feature interaction inside the network; (3) instead of transmitting\nmessages arbitrarily, a scene knowledge graph is devised to differentiate prior\nknowledge from various types of the road genome. We evaluate TopoNet on the\nchallenging scene understanding benchmark, OpenLane-V2, where our approach\noutperforms all previous works by a great margin on all perceptual and\ntopological metrics.
The code is released at\nhttps://github.com/OpenDriveLab/TopoNet\n","authors":["Tianyu Li","Li Chen","Huijie Wang","Yang Li","Jiazhi Yang","Xiangwei Geng","Shengyin Jiang","Yuting Wang","Hang Xu","Chunjing Xu","Junchi Yan","Ping Luo","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2304.05277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10058v2","updated":"2023-08-28T10:46:22Z","published":"2023-03-17T15:38:39Z","title":"No Fear of Classifier Biases: Neural Collapse Inspired Federated\n Learning with Synthetic and Fixed Classifier","summary":" Data heterogeneity is an inherent challenge that hinders the performance of\nfederated learning (FL). Recent studies have identified the biased classifiers\nof local models as the key bottleneck. Previous attempts have used classifier\ncalibration after FL training, but this approach falls short in improving the\npoor feature representations caused by training-time classifier biases.\nResolving the classifier bias dilemma in FL requires a full understanding of\nthe mechanisms behind the classifier. Recent advances in neural collapse have\nshown that the classifiers and feature prototypes under perfect training\nscenarios collapse into an optimal structure called simplex equiangular tight\nframe (ETF). Building on this neural collapse insight, we propose a solution to\nthe FL's classifier bias problem by utilizing a synthetic and fixed ETF\nclassifier during training. The optimal classifier structure enables all\nclients to learn unified and optimal feature representations even under\nextremely heterogeneous data. We devise several effective modules to better\nadapt the ETF structure in FL, achieving both high generalization and\npersonalization. Extensive experiments demonstrate that our method achieves\nstate-of-the-art performances on CIFAR-10, CIFAR-100, and Tiny-ImageNet.\n","authors":["Zexi Li","Xinyi Shang","Rui He","Tao Lin","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2303.10058v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14481v1","updated":"2023-08-28T10:43:53Z","published":"2023-08-28T10:43:53Z","title":"Group Regression for Query Based Object Detection and Tracking","summary":" Group regression is commonly used in 3D object detection to predict box\nparameters of similar classes in a joint head, aiming to benefit from\nsimilarities while separating highly dissimilar classes. For query-based\nperception methods, this has, so far, not been feasible. We close this gap and\npresent a method to incorporate multi-class group regression, especially\ndesigned for the 3D domain in the context of autonomous driving, into existing\nattention and query-based perception approaches. We enhance a transformer based\njoint object detection and tracking model with this approach, and thoroughly\nevaluate its behavior and performance. For group regression, the classes of the\nnuScenes dataset are divided into six groups of similar shape and prevalence,\neach being regressed by a dedicated head. We show that the proposed method is\napplicable to many existing transformer based perception approaches and can\nbring potential benefits. The behavior of query group regression is thoroughly\nanalyzed in comparison to a unified regression head, e.g. in terms of\nclass-switching behavior and distribution of the output parameters. 
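The simplex equiangular tight frame mentioned in the neural-collapse entry above has a standard closed form, M = sqrt(K/(K-1)) U (I - 11^T/K), from which a synthetic fixed classifier can be built. A minimal construction (not the paper's training code):

```python
import numpy as np

def simplex_etf(num_classes: int, feat_dim: int, seed: int = 0) -> np.ndarray:
    """Return a (feat_dim x num_classes) simplex ETF classifier matrix.

    Columns have unit norm and maximally equal pairwise angles: for K
    classes, cos(w_i, w_j) = -1/(K-1) for i != j, the structure classifiers
    provably collapse to in the neural-collapse regime.
    """
    K = num_classes
    assert feat_dim >= K, "need feat_dim >= num_classes for this construction"
    rng = np.random.default_rng(seed)
    # Random orthonormal basis U (feat_dim x K) via QR decomposition.
    U, _ = np.linalg.qr(rng.standard_normal((feat_dim, K)))
    # M = sqrt(K/(K-1)) * U (I - 11^T / K): the canonical simplex ETF.
    return np.sqrt(K / (K - 1)) * U @ (np.eye(K) - np.ones((K, K)) / K)

W = simplex_etf(num_classes=10, feat_dim=64)
G = W.T @ W                      # Gram matrix of the class vectors
print(np.round(G[0, :3], 3))     # diagonal ~1, off-diagonal -1/(K-1) = -0.111
```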
The\nproposed method offers many possibilities for further research, such as in the\ndirection of deep multi-hypotheses tracking.\n","authors":["Felicia Ruppel","Florian Faion","Claudius Gläser","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2308.14481v1.pdf","comment":"Accepted for publication at the 2023 26th IEEE International\n Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28,\n 2023, in Bilbao, Spain"},{"id":"http://arxiv.org/abs/2307.07205v3","updated":"2023-08-28T10:41:07Z","published":"2023-07-14T07:42:45Z","title":"Multimodal Motion Conditioned Diffusion Model for Skeleton-based Video\n Anomaly Detection","summary":" Anomalies are rare and anomaly detection is often therefore framed as\nOne-Class Classification (OCC), i.e. trained solely on normalcy. Leading OCC\ntechniques constrain the latent representations of normal motions to limited\nvolumes and detect as abnormal anything outside, which accounts satisfactorily\nfor the openset'ness of anomalies. But normalcy shares the same openset'ness\nproperty since humans can perform the same action in several ways, which the\nleading techniques neglect. We propose a novel generative model for video\nanomaly detection (VAD), which assumes that both normality and abnormality are\nmultimodal. We consider skeletal representations and leverage state-of-the-art\ndiffusion probabilistic models to generate multimodal future human poses. We\ncontribute a novel conditioning on the past motion of people and exploit the\nimproved mode coverage capabilities of diffusion processes to generate\ndifferent-but-plausible future motions. Upon the statistical aggregation of\nfuture modes, an anomaly is detected when the generated set of motions is not\npertinent to the actual future. We validate our model on 4 established\nbenchmarks: UBnormal, HR-UBnormal, HR-STC, and HR-Avenue, with extensive\nexperiments surpassing state-of-the-art results.\n","authors":["Alessandro Flaborea","Luca Collorone","Guido D'Amely","Stefano D'Arrigo","Bardh Prenkaj","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2307.07205v3.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.14480v1","updated":"2023-08-28T10:40:16Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. 
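The "statistical aggregation of future modes" idea in the video-anomaly entry above reduces to a simple scoring rule: a clip is anomalous when none of the generated futures lies near the observed one. A schematic, with a random sampler standing in for the conditional diffusion model:

```python
import numpy as np

def anomaly_score(generated: np.ndarray, actual: np.ndarray) -> float:
    """Score a clip by how far the observed future pose sequence lies from
    the nearest generated future mode (smaller = more normal).

    generated: (num_modes, T, J, 3) futures sampled from the generative model
    actual:    (T, J, 3) the ground-truth future skeleton motion
    """
    # Per-mode mean joint displacement to the actual future.
    dists = np.linalg.norm(generated - actual[None], axis=-1).mean(axis=(1, 2))
    return float(dists.min())

rng = np.random.default_rng(0)
# Stand-in for the diffusion model: sample 20 plausible future motions.
futures = rng.normal(size=(20, 12, 17, 3))
observed = rng.normal(size=(12, 17, 3))
score = anomaly_score(futures, observed)
flagged = score > 1.5   # threshold would be chosen on a validation set
print(score, flagged)
```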
We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14477v1","updated":"2023-08-28T10:30:08Z","published":"2023-08-28T10:30:08Z","title":"Medical needle tip tracking based on Optical Imaging and AI","summary":" Deep needle insertion to a target often poses a huge challenge, requiring a\ncombination of specialized skills, assistive technology, and extensive\ntraining. One of the frequently encountered medical scenarios demanding such\nexpertise includes the needle insertion into a femoral vessel in the groin.\nAfter access to the femoral vessel, various medical procedures, such as\ncardiac catheterization and extracorporeal membrane oxygenation (ECMO), can be\nperformed. However, even with the aid of ultrasound imaging, achieving\nsuccessful insertion can necessitate multiple attempts due to the complexities\nof anatomy and tissue deformation. To address this challenge, this paper\npresents an innovative technology for real-time needle tip tracking, aiming for\nenhanced needle insertion guidance. Specifically, our approach revolves around\nthe creation of scattering imaging using an optical fiber-equipped needle, and\nuses Convolutional Neural Network (CNN) based algorithms to enable real-time\nestimation of the needle tip's position and orientation during insertion\nprocedures. The efficacy of the proposed technology was rigorously evaluated\nthrough three experiments. The first two experiments involved rubber and bacon\nphantoms to simulate groin anatomy, with positional errors averaging 2.3±1.5 mm\nand 2.0±1.2 mm, and orientation errors averaging 0.2±0.11 rad and\n0.16±0.1 rad. Furthermore, the system's capabilities were validated through\nexperiments conducted on a fresh porcine phantom mimicking more complex\nanatomical structures, yielding a positional accuracy of 3.2±3.1 mm and\nan orientational accuracy of 0.19±0.1 rad. Given the average femoral arterial\nradius of 4 to 5 mm, the proposed system demonstrates great potential\nfor precise needle guidance in femoral artery insertion procedures.
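As a rough illustration of the CNN-based tip estimation described above, the toy regressor below maps one scattering image to a 3-D position plus a 2-D orientation. All layer sizes and the output parameterization are assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

# Minimal CNN regressor: single-channel scattering image in, five outputs
# interpreted as (x, y, z, pitch, yaw). Purely illustrative sizes.
class TipTracker(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )
        self.head = nn.Linear(32, 5)

    def forward(self, img):
        z = self.features(img).flatten(1)
        return self.head(z)

model = TipTracker()
frame = torch.randn(1, 1, 128, 128)          # one scattering image
out = model(frame)
position_mm, orientation_rad = out[:, :3], out[:, 3:]
print(position_mm.shape, orientation_rad.shape)
```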
In\naddition, the findings highlight the broader potential applications of the\nsystem in the medical field.\n","authors":["Zhuoqi Cheng","Simon Lyck Bjært Sørensen","Mikkel Werge Olsen","René Lynge Eriksen","Thiusius Rajeeth Savarimuthu"],"pdf_url":"https://arxiv.org/pdf/2308.14477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08451v3","updated":"2023-08-28T10:22:23Z","published":"2023-04-17T17:21:21Z","title":"Efficient Video Action Detection with Token Dropout and Context\n Refinement","summary":" Streaming video clips with large-scale video tokens impede vision\ntransformers (ViTs) for efficient recognition, especially in video action\ndetection where sufficient spatiotemporal representations are required for\nprecise actor identification. In this work, we propose an end-to-end framework\nfor efficient video action detection (EVAD) based on vanilla ViTs. Our EVAD\nconsists of two specialized designs for video action detection. First, we\npropose a spatiotemporal token dropout from a keyframe-centric perspective. In\na video clip, we maintain all tokens from its keyframe, preserve tokens\nrelevant to actor motions from other frames, and drop out the remaining tokens\nin this clip. Second, we refine scene context by leveraging remaining tokens\nfor better recognizing actor identities. The region of interest (RoI) in our\naction detector is expanded into the temporal domain. The captured\nspatiotemporal actor identity representations are refined via scene context in\na decoder with the attention mechanism. These two designs make our EVAD\nefficient while maintaining accuracy, which is validated on three benchmark\ndatasets (i.e., AVA, UCF101-24, JHMDB). Compared to the vanilla ViT backbone,\nour EVAD reduces the overall GFLOPs by 43% and improves real-time inference\nspeed by 40% with no performance degradation. Moreover, even at similar\ncomputational costs, our EVAD can improve the performance by 1.1 mAP with\nhigher resolution inputs. Code is available at https://github.com/MCG-NJU/EVAD.\n","authors":["Lei Chen","Zhan Tong","Yibing Song","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2304.08451v3.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2308.14469v1","updated":"2023-08-28T10:15:57Z","published":"2023-08-28T10:15:57Z","title":"Pixel-Aware Stable Diffusion for Realistic Image Super-resolution and\n Personalized Stylization","summary":" Realistic image super-resolution (Real-ISR) aims to reproduce perceptually\nrealistic image details from a low-quality input. The commonly used adversarial\ntraining based Real-ISR methods often introduce unnatural visual artifacts and\nfail to generate realistic textures for natural scene images. The recently\ndeveloped generative stable diffusion models provide a potential solution to\nReal-ISR with pre-learned strong image priors. However, the existing methods\nalong this line either fail to keep faithful pixel-wise image structures or\nresort to extra skip connections to reproduce details, which requires\nadditional training in image space and limits their extension to other related\ntasks in latent space such as image stylization. In this work, we propose a\npixel-aware stable diffusion (PASD) network to achieve robust Real-ISR as well\nas personalized stylization.
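The keyframe-centric token dropout in the EVAD entry above can be sketched directly: keep every keyframe token, and for other frames keep only the tokens with the largest temporal change, a simple proxy for "relevant to actor motions". This is an illustrative reading, not the released implementation:

```python
import torch

def keyframe_token_dropout(tokens, keyframe_idx, keep_ratio=0.4):
    """Schematic keyframe-centric spatiotemporal token dropout.

    tokens: (T, N, C) patch tokens for T frames. All keyframe tokens are
    kept; every other frame keeps only its top-k tokens ranked by a simple
    motion proxy (difference to the previous frame).
    """
    T, N, C = tokens.shape
    motion = (tokens - tokens.roll(1, dims=0)).norm(dim=-1)   # (T, N)
    keep = int(N * keep_ratio)
    kept = []
    for t in range(T):
        if t == keyframe_idx:
            kept.append(tokens[t])                    # keep every token
        else:
            idx = motion[t].topk(keep).indices        # top-k "moving" tokens
            kept.append(tokens[t, idx])
    return torch.cat(kept, dim=0)                     # (N + (T-1)*keep, C)

clip = torch.randn(8, 196, 768)                       # 8 frames of ViT tokens
pruned = keyframe_token_dropout(clip, keyframe_idx=4)
print(clip.shape[0] * clip.shape[1], "->", pruned.shape[0])   # 1568 -> 742
```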
Specifically, a pixel-aware cross-attention module\nis introduced to enable diffusion models to perceive local image structures at\nthe pixel level, while a degradation removal module is used to extract\ndegradation-insensitive features to guide the diffusion process together with\nhigh-level image information. By simply replacing the base diffusion model with\na personalized one, our method can generate diverse stylized images without the\nneed to collect pairwise training data. PASD can be easily integrated into\nexisting diffusion models such as Stable Diffusion. Experiments on Real-ISR and\npersonalized stylization demonstrate the effectiveness of our proposed\napproach. The source code and models can be found at\nhttps://github.com/yangxy/PASD.\n","authors":["Tao Yang","Peiran Ren","Xuansong Xie","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00247v3","updated":"2023-08-28T10:12:03Z","published":"2023-08-01T03:00:36Z","title":"Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive\n Review","summary":" The advent of deep learning has brought a revolutionary transformation to\nimage denoising techniques. However, the persistent challenge of acquiring\nnoise-clean pairs for supervised methods in real-world scenarios remains\nformidable, necessitating the exploration of more practical self-supervised\nimage denoising. This paper focuses on self-supervised image denoising methods\nthat offer effective solutions to address this challenge. Our comprehensive\nreview thoroughly analyzes the latest advancements in self-supervised image\ndenoising approaches, categorizing them into three distinct classes: General\nmethods, Blind Spot Network (BSN)-based methods, and Transformer-based methods.\nFor each class, we provide a concise theoretical analysis along with their\npractical applications. To assess the effectiveness of these methods, we\npresent both quantitative and qualitative experimental results on various\ndatasets, utilizing classical algorithms as benchmarks. Additionally, we\ncritically discuss the current limitations of these methods and propose\npromising directions for future research. By offering a detailed overview of\nrecent developments in self-supervised image denoising, this review serves as\nan invaluable resource for researchers and practitioners in the field,\nfacilitating a deeper understanding of this emerging domain and inspiring\nfurther advancements.\n","authors":["Dan Zhang","Fangfang Zhou","Xiao Yang","Yuan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.00247v3.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2308.14466v1","updated":"2023-08-28T10:04:06Z","published":"2023-08-28T10:04:06Z","title":"Improving the performance of object detection by preserving label\n distribution","summary":" Object detection is a task that performs position identification and label\nclassification of objects in images or videos. The information obtained through\nthis process plays an essential role in various tasks in the field of computer\nvision. In object detection, the data utilized for training and validation\ntypically originate from public datasets that are well-balanced in terms of the\nnumber of objects ascribed to each class in an image.
However, in real-world\nscenarios, handling datasets with much greater class imbalance, i.e., very\ndifferent numbers of objects for each class, is much more common, and this\nimbalance may reduce the performance of object detection when predicting unseen\ntest images. Thus, in our study, we propose a method that evenly distributes\nthe classes in an image for training and validation, solving the class\nimbalance problem in object detection. Our proposed method aims to maintain a\nuniform class distribution through multi-label stratification. We tested our\nproposed method not only on public datasets that typically exhibit balanced\nclass distribution but also on custom datasets that may have imbalanced class\ndistribution. We found that our proposed method was more effective on datasets\ncontaining severe imbalance and little data. Our findings indicate that the\nproposed method can be effectively used on datasets with substantially\nimbalanced class distribution.\n","authors":["Heewon Lee","Sangtae Ahn"],"pdf_url":"https://arxiv.org/pdf/2308.14466v1.pdf","comment":"Code is available at\n https://github.com/leeheewon-01/YOLOstratifiedKFold/tree/main"},{"id":"http://arxiv.org/abs/2212.04740v3","updated":"2023-08-28T09:59:28Z","published":"2022-12-09T09:36:59Z","title":"Predicting Shape Development: a Riemannian Method","summary":" Predicting the future development of an anatomical shape from a single\nbaseline observation is a challenging task. But it can be essential for\nclinical decision-making. Research has shown that it should be tackled in\ncurved shape spaces, as (e.g., disease-related) shape changes frequently expose\nnonlinear characteristics. We thus propose a novel prediction method that\nencodes the whole shape in a Riemannian shape space. It then learns a simple\nprediction technique founded on hierarchical statistical modeling of\nlongitudinal training data. When applied to predict the future development of\nthe shape of the right hippocampus under Alzheimer's disease and to human body\nmotion, it outperforms deep learning-supported variants as well as the\nstate-of-the-art.\n","authors":["Doğa Türkseven","Islem Rekik","Christoph von Tycowicz","Martin Hanik"],"pdf_url":"https://arxiv.org/pdf/2212.04740v3.pdf","comment":"new experiment with human motion data; fixed vertex-assignment bug in\n the prediction of the varifold-based method"},{"id":"http://arxiv.org/abs/2308.14461v1","updated":"2023-08-28T09:58:34Z","published":"2023-08-28T09:58:34Z","title":"Spatio-Temporal Analysis of Patient-Derived Organoid Videos Using Deep\n Learning for the Prediction of Drug Efficacy","summary":" Over the last ten years, Patient-Derived Organoids (PDOs) emerged as the most\nreliable technology to generate ex-vivo tumor avatars. PDOs retain the main\ncharacteristics of their original tumor, making them a system of choice for\npre-clinical and clinical studies. In particular, PDOs are attracting interest\nin the field of Functional Precision Medicine (FPM), which is based upon an\nex-vivo drug test in which living tumor cells (such as PDOs) from a specific\npatient are exposed to a panel of anti-cancer drugs. Currently, the Adenosine\nTriphosphate (ATP) based cell viability assay is the gold standard test to\nassess the sensitivity of PDOs to drugs. The readout is measured at the end of\nthe assay from a global PDO population and therefore does not capture single\nPDO responses and does not provide time resolution of drug effect.
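The multi-label stratification in the label-distribution entry above can be approximated with a greedy assignment over per-image object counts. The authors use stratified k-fold; the two-way variant below is a simplified sketch of the same idea, not their released code.

```python
import numpy as np

def greedy_multilabel_split(label_counts, val_frac=0.2, seed=0):
    """Greedy two-way multi-label stratification over object counts.

    label_counts: (num_images, num_classes) boxes per class in each image.
    Images are visited rarest-class first and assigned to the split
    (0 = train, 1 = val) whose present classes are furthest below quota,
    keeping the class distribution near-identical across both splits.
    """
    rng = np.random.default_rng(seed)
    n, _ = label_counts.shape
    quota = np.outer([1 - val_frac, val_frac], label_counts.sum(0))
    filled = np.zeros_like(quota)
    split = np.zeros(n, dtype=int)
    class_freq = label_counts.sum(0)
    rarity = np.where(label_counts > 0, class_freq[None, :], np.inf).min(1)
    for i in np.lexsort((rng.random(n), rarity)):   # rarest images first
        present = label_counts[i] > 0
        deficit = (quota - filled)[:, present].sum(1)
        split[i] = int(np.argmax(deficit)) if present.any() else 0
        filled[split[i]] += label_counts[i]
    return split

counts = np.random.default_rng(1).integers(0, 4, size=(200, 6))
split = greedy_multilabel_split(counts)
print(counts[split == 1].sum(0) / counts.sum(0))  # each class close to 0.2
```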
To this end,\nin this study, we explore for the first time the use of powerful large\nfoundation models for the automatic processing of PDO data. In particular, we\npropose a novel imaging-based high-throughput screening method to assess\nreal-time drug efficacy from a time-lapse microscopy video of PDOs. The\nrecently proposed SAM segmentation algorithm and the DINOv2 model are adapted\nin a comprehensive pipeline for processing PDO microscopy frames. Moreover, an\nattention mechanism is proposed for fusing temporal and spatial features in a\nmultiple instance learning setting to predict ATP. We report better results\nthan other non-time-resolved methods, indicating that the temporality of data\nis an important factor for the prediction of ATP. Extensive ablations shed\nlight on optimizing the experimental setting and automating the prediction both\nin real-time and for forecasting.\n","authors":["Leo Fillioux","Emilie Gontran","Jérôme Cartry","Jacques RR Mathieu","Sabrina Bedja","Alice Boilève","Paul-Henry Cournède","Fanny Jaulin","Stergios Christodoulidis","Maria Vakalopoulou"],"pdf_url":"https://arxiv.org/pdf/2308.14461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14448v1","updated":"2023-08-28T09:35:13Z","published":"2023-08-28T09:35:13Z","title":"ExpCLIP: Bridging Text and Facial Expressions via Semantic Alignment","summary":" The objective of stylized speech-driven facial animation is to create\nanimations that encapsulate specific emotional expressions. Existing methods\noften depend on pre-established emotional labels or facial expression\ntemplates, which may limit the necessary flexibility for accurately conveying\nuser intent. In this research, we introduce a technique that enables the\ncontrol of arbitrary styles by leveraging natural language as emotion prompts.\nThis technique presents benefits in terms of both flexibility and\nuser-friendliness. To realize this objective, we initially construct a\nText-Expression Alignment Dataset (TEAD), wherein each facial expression is\npaired with several prompt-like descriptions. We propose an innovative automatic\nannotation method, supported by Large Language Models (LLMs), to expedite the\ndataset construction, thereby eliminating the substantial expense of manual\nannotation. Following this, we utilize TEAD to train a CLIP-based model, termed\nExpCLIP, which encodes text and facial expressions into semantically aligned\nstyle embeddings. The embeddings are subsequently integrated into the facial\nanimation generator to yield expressive and controllable facial animations.\nGiven the limited diversity of facial emotions in existing speech-driven facial\nanimation training data, we further introduce an effective Expression Prompt\nAugmentation (EPA) mechanism to enable the animation generator to support\nunprecedented richness in style control. Comprehensive experiments illustrate\nthat our method accomplishes expressive facial animation generation and offers\nenhanced flexibility in effectively conveying the desired style.\n","authors":["Yicheng Zhong","Huawei Wei","Peiji Yang","Zhisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14437v1","updated":"2023-08-28T09:23:18Z","published":"2023-08-28T09:23:18Z","title":"Data-iterative Optimization Score Model for Stable Ultra-Sparse-View CT\n Reconstruction","summary":" Score-based generative models (SGMs) have gained prominence in sparse-view CT\nreconstruction for their precise sampling of complex distributions.
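Training a CLIP-style model such as the ExpCLIP entry above typically uses a symmetric InfoNCE objective over paired embeddings. A minimal version of that objective, with random features standing in for the text and expression encoders:

```python
import torch
import torch.nn.functional as F

def clip_alignment_loss(text_emb, expr_emb, temperature=0.07):
    """Symmetric InfoNCE loss aligning paired text / expression embeddings,
    the standard CLIP-style objective. Row i of each batch is assumed to be
    a matching (text, expression) pair.
    """
    text = F.normalize(text_emb, dim=-1)
    expr = F.normalize(expr_emb, dim=-1)
    logits = text @ expr.t() / temperature          # (B, B) similarities
    labels = torch.arange(len(logits))              # diagonal = positives
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.t(), labels)) / 2

# Random features stand in for the two encoders in this sketch.
text_emb = torch.randn(32, 512, requires_grad=True)
expr_emb = torch.randn(32, 512, requires_grad=True)
loss = clip_alignment_loss(text_emb, expr_emb)
loss.backward()
print(float(loss))
```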
In\nSGM-based reconstruction, data consistency in the score-based diffusion model\nensures close adherence of generated samples to the observed data distribution,\nwhich is crucial for improving image quality. Shortcomings in data consistency\ncharacterization manifest in three aspects. Firstly, data from the optimization\nprocess can lead to artifacts in reconstructed images. Secondly, it often\nneglects that the generation model and original data constraints are\nindependently completed, fragmenting unity. Thirdly, it predominantly focuses\non constraining intermediate results in the inverse sampling process, rather\nthan ideal real images. Thus, we propose an iterative optimization data scoring\nmodel. This paper introduces the data-iterative optimization score-based model\n(DOSM), integrating innovative data consistency into the Stochastic\nDifferential Equation, a valuable constraint for ultra-sparse-view CT\nreconstruction. The novelty of this data consistency element lies in its sole\nreliance on original measurement data to confine generation outcomes,\neffectively balancing measurement data and generative model constraints.\nAdditionally, we pioneer an inference strategy that traces back from current\niteration results to ideal truth, enhancing reconstruction stability. We\nleverage conventional iteration techniques to optimize DOSM updates.\nQuantitative and qualitative results from 23 views of numerical and clinical\ncardiac datasets demonstrate DOSM's superiority over other methods. Remarkably,\neven with 10 views, our method achieves excellent performance.\n","authors":["Weiwen Wu","Yanyang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14437v1.pdf","comment":"11 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.14419v1","updated":"2023-08-28T08:59:57Z","published":"2023-08-28T08:59:57Z","title":"Graph-based Asynchronous Event Processing for Rapid Object Recognition","summary":" Different from traditional video cameras, event cameras capture an asynchronous\nevent stream in which each event encodes pixel location, trigger time, and the\npolarity of the brightness changes. In this paper, we introduce a novel\ngraph-based framework for event cameras, namely SlideGCN. Unlike some recent\ngraph-based methods that use groups of events as input, our approach can\nefficiently process data event-by-event, unlocking the low-latency nature of\nevent data while still maintaining the graph's structure internally. For fast\ngraph construction, we develop a radius search algorithm, which better exploits\nthe partial regular structure of the event cloud compared to generic k-d tree\nbased methods. Experiments show that our method reduces the computational\ncomplexity by up to 100 times with respect to current graph-based methods while\nkeeping state-of-the-art performance on object recognition. Moreover, we verify\nthe superiority of event-wise processing with our method. When the state\nbecomes stable, we can give a prediction with high confidence, thus enabling\nearly recognition. Project page: https://zju3dv.github.io/slide_gcn/.\n","authors":["Yijin Li","Han Zhou","Bangbang Yang","Ye Zhang","Zhaopeng Cui","Hujun Bao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14419v1.pdf","comment":"Accepted to ICCV 2021.
Project Page:\n https://zju3dv.github.io/slide_gcn/"},{"id":"http://arxiv.org/abs/2303.11917v2","updated":"2023-08-28T08:54:47Z","published":"2023-03-21T15:08:35Z","title":"Efficient Decision-based Black-box Patch Attacks on Video Recognition","summary":" Although Deep Neural Networks (DNNs) have demonstrated excellent performance,\nthey are vulnerable to adversarial patches that introduce perceptible and\nlocalized perturbations to the input. Generating adversarial patches on images\nhas received much attention, while adversarial patches on videos have not been\nwell investigated. Further, decision-based attacks, where attackers only access\nthe predicted hard labels by querying threat models, have not been well\nexplored on video models either, even though they are practical in real-world\nvideo recognition scenes. The absence of such studies leads to a huge gap in the\nrobustness assessment for video models. To bridge this gap, this work first\nexplores decision-based patch attacks on video models. We analyze that the huge\nparameter space brought by videos and the minimal information returned by\ndecision-based models both greatly increase the attack difficulty and query\nburden. To achieve a query-efficient attack, we propose a spatial-temporal\ndifferential evolution (STDE) framework. First, STDE introduces target videos\nas patch textures and only adds patches on keyframes that are adaptively\nselected by temporal difference. Second, STDE takes minimizing the patch area\nas the optimization objective and adopts spatial-temporal mutation and crossover\nto search for the global optimum without falling into the local optimum.\nExperiments show STDE has demonstrated state-of-the-art performance in terms of\nthreat, efficiency and imperceptibility. Hence, STDE has the potential to be a\npowerful tool for evaluating the robustness of video recognition models.\n","authors":["Kaixun Jiang","Zhaoyu Chen","Hao Huang","Jiafeng Wang","Dingkang Yang","Bo Li","Yan Wang","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.11917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14418v1","updated":"2023-08-28T08:54:27Z","published":"2023-08-28T08:54:27Z","title":"Multi-Scale and Multi-Layer Contrastive Learning for Domain\n Generalization","summary":" During the past decade, deep neural networks have led to fast-paced progress\nand significant achievements in computer vision problems, for both academia and\nindustry. Yet despite their success, state-of-the-art image classification\napproaches fail to generalize well in previously unseen visual contexts, as\nrequired by many real-world applications. In this paper, we focus on this\ndomain generalization (DG) problem and argue that the generalization ability of\ndeep convolutional neural networks can be improved by taking advantage of\nmulti-layer and multi-scaled representations of the network. We introduce a\nframework that aims at improving domain generalization of image classifiers by\ncombining both low-level and high-level features at multiple scales, enabling\nthe network to implicitly disentangle representations in its latent space and\nlearn domain-invariant attributes of the depicted objects. Additionally, to\nfurther facilitate robust representation learning, we propose a novel objective\nfunction, inspired by contrastive learning, which aims at constraining the\nextracted representations to remain invariant under distribution shifts.
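The multi-layer invariance objective described in the domain-generalization entry above can be sketched as a per-scale similarity penalty between two views of the same images. This is a simplified stand-in for the paper's contrastive loss, not its exact formulation:

```python
import torch
import torch.nn.functional as F

def multiscale_invariance_loss(feats_a, feats_b):
    """Invariance penalty across network depths.

    feats_a / feats_b: lists of (B, C_l) features from several layers of
    the same backbone for two views of the same images (e.g. two domains
    or two augmentations). Pulling them together at every scale encourages
    domain-invariant low- and high-level representations.
    """
    loss = 0.0
    for fa, fb in zip(feats_a, feats_b):
        loss = loss + (1 - F.cosine_similarity(fa, fb, dim=-1)).mean()
    return loss / len(feats_a)

# Features from three layers of a hypothetical backbone, for two views.
feats_view1 = [torch.randn(16, c, requires_grad=True) for c in (64, 256, 512)]
feats_view2 = [torch.randn(16, c) for c in (64, 256, 512)]
loss = multiscale_invariance_loss(feats_view1, feats_view2)
loss.backward()
print(float(loss))
```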
We\ndemonstrate the effectiveness of our method by evaluating on the domain\ngeneralization datasets of PACS, VLCS, Office-Home and NICO. Through extensive\nexperimentation, we show that our model is able to surpass the performance of\nprevious DG methods and consistently produce competitive and state-of-the-art\nresults in all datasets.\n","authors":["Aristotelis Ballas","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2308.14418v1.pdf","comment":"Manuscript under review at: IEEE Transactions on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2308.14414v1","updated":"2023-08-28T08:51:20Z","published":"2023-08-28T08:51:20Z","title":"INF: Implicit Neural Fusion for LiDAR and Camera","summary":" Sensor fusion has become a popular topic in robotics. However, conventional\nfusion methods encounter many difficulties, such as data representation\ndifferences, sensor variations, and extrinsic calibration. For example, the\ncalibration methods used for LiDAR-camera fusion often require manual operation\nand auxiliary calibration targets. Implicit neural representations (INRs) have\nbeen developed for 3D scenes, and the volume density distribution involved in\nan INR unifies the scene information obtained by different types of sensors.\nTherefore, we propose implicit neural fusion (INF) for LiDAR and camera. INF\nfirst trains a neural density field of the target scene using LiDAR frames.\nThen, a separate neural color field is trained using camera images and the\ntrained neural density field. Along with the training process, INF both\nestimates LiDAR poses and optimizes extrinsic parameters. Our experiments\ndemonstrate the high accuracy and stable performance of the proposed method.\n","authors":["Shuyi Zhou","Shuxiang Xie","Ryoichi Ishikawa","Ken Sakurada","Masaki Onishi","Takeshi Oishi"],"pdf_url":"https://arxiv.org/pdf/2308.14414v1.pdf","comment":"Accepted to IROS 2023. (project page:\n https://ShuyiZhou495.github.io/inf-project-page/)"},{"id":"http://arxiv.org/abs/2308.14409v1","updated":"2023-08-28T08:47:06Z","published":"2023-08-28T08:47:06Z","title":"Steerable Conditional Diffusion for Out-of-Distribution Adaptation in\n Imaging Inverse Problems","summary":" Denoising diffusion models have emerged as the go-to framework for solving\ninverse problems in imaging. A critical concern regarding these models is their\nperformance on out-of-distribution (OOD) tasks, which remains an under-explored\nchallenge. Realistic reconstructions inconsistent with the measured data can be\ngenerated, hallucinating image features that are uniquely present in the\ntraining dataset. To simultaneously enforce data-consistency and leverage\ndata-driven priors, we introduce a novel sampling framework called Steerable\nConditional Diffusion. This framework adapts the denoising network specifically\nto the available measured data. 
Utilising our proposed method, we achieve\nsubstantial enhancements in OOD performance across diverse imaging modalities,\nadvancing the robust deployment of denoising diffusion models in real-world\napplications.\n","authors":["Riccardo Barbano","Alexander Denker","Hyungjin Chung","Tae Hoon Roh","Simon Arrdige","Peter Maass","Bangti Jin","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2308.14409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14400v1","updated":"2023-08-28T08:33:45Z","published":"2023-08-28T08:33:45Z","title":"Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer\n and NearFarMix Augmentation","summary":" In computer vision, depth estimation is crucial for domains like robotics,\nautonomous vehicles, augmented reality, and virtual reality. Integrating\nsemantics with depth enhances scene understanding through reciprocal\ninformation sharing. However, the scarcity of semantic information in datasets\nposes challenges. Existing convolutional approaches with limited local\nreceptive fields hinder the full utilization of the symbiotic potential between\ndepth and semantics. This paper introduces a dataset-invariant semi-supervised\nstrategy to address the scarcity of semantic information. It proposes the Depth\nSemantics Symbiosis module, leveraging the Symbiotic Transformer for achieving\ncomprehensive mutual awareness by information exchange within both local and\nglobal contexts. Additionally, a novel augmentation, NearFarMix, is introduced\nto combat overfitting and compensate both depth-semantic tasks by strategically\nmerging regions from two images, generating diverse and structurally consistent\nsamples with enhanced control. Extensive experiments on NYU-Depth-V2 and KITTI\ndatasets demonstrate the superiority of our proposed techniques in indoor and\noutdoor environments.\n","authors":["Md Awsafur Rahman","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2308.14400v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2202.06599v3","updated":"2023-08-28T08:27:30Z","published":"2022-02-14T10:40:51Z","title":"Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in\n First Trimester 3D Ultrasound","summary":" Segmentation and spatial alignment of ultrasound (US) imaging data acquired\nin the first trimester are crucial for monitoring human embryonic growth and\ndevelopment throughout this critical period of life. Current approaches are\neither manual or semi-automatic and are therefore very time-consuming and prone\nto errors. To automate these tasks, we propose a multi-atlas framework for\nautomatic segmentation and spatial alignment of the embryo using deep learning\nwith minimal supervision. Our framework learns to register the embryo to an\natlas, which consists of the US images acquired at a range of gestational ages\n(GA), segmented and spatially aligned to a predefined standard orientation.\nFrom this, we can derive the segmentation of the embryo and put the embryo in\nthe standard orientation. US images acquired at 8+0 to 12+6 weeks GA were used\nand eight subjects were selected as atlases. We evaluated different fusion\nstrategies to incorporate multiple atlases: 1) training the framework using\natlas images from a single subject, 2) training the framework with data of all\navailable atlases, and 3) ensembling of the frameworks trained per subject. To\nevaluate the performance, we calculated the Dice score over the test set.
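For reference, the Dice score used for evaluation above is 2|P∩G| / (|P|+|G|): a median of 0.72 means the predicted and ground-truth masks share 72% of their combined volume. A minimal implementation:

```python
import numpy as np

def dice_score(pred: np.ndarray, gt: np.ndarray, eps: float = 1e-8) -> float:
    """Dice overlap between two binary masks: 2|P ∩ G| / (|P| + |G|)."""
    pred, gt = pred.astype(bool), gt.astype(bool)
    return float(2 * (pred & gt).sum() / (pred.sum() + gt.sum() + eps))

# Two toy 3D segmentation masks; independent random masks score ~0.5.
rng = np.random.default_rng(0)
a = rng.random((32, 32, 32)) > 0.5
b = rng.random((32, 32, 32)) > 0.5
print(round(dice_score(a, b), 3))
```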
We\nfound that training the framework using all available atlases outperformed\nensembling and gave similar results compared to the best of all frameworks\ntrained on a single subject. Furthermore, we found that selecting images from\nthe four atlases closest in GA out of all available atlases, regardless of the\nindividual quality, gave the best results with a median Dice score of 0.72. We\nconclude that our framework can accurately segment and spatially align the\nembryo in first trimester 3D US images and is robust for the variation in\nquality that existed in the available atlases.\n","authors":["W. A. P. Bastiaansen","M. Rousian","R. P. M. Steegers-Theunissen","W. J. Niessen","A. H. J. Koning","S. Klein"],"pdf_url":"https://arxiv.org/pdf/2202.06599v3.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html"},{"id":"http://arxiv.org/abs/2308.14397v1","updated":"2023-08-28T08:24:25Z","published":"2023-08-28T08:24:25Z","title":"Ensemble of Anchor-Free Models for Robust Bangla Document Layout\n Segmentation","summary":" In this research paper, we present an innovative system designed for the\npurpose of segmenting the layout of Bangla documents. Our methodology involves\nutilizing a sophisticated collection of YOLOv8 models, meticulously adapted for\nthe DL Sprint 2.0 - BUET CSE Fest 2023 Competition that centers around Bangla\ndocument layout segmentation. Our primary focus lies in elevating various\nelements of the task, including techniques like image augmentation, model\narchitecture, and the use of model ensembles. We intentionally lower the\nquality of a subset of document images to enhance the resilience of model\ntraining, consequently leading to an improvement in our cross-validation score.\nEmploying Bayesian optimization, we determine the optimal confidence and IoU\nthresholds for our model ensemble. Through our approach, we successfully\nshowcase the effectiveness of amalgamating anchor-free models to achieve robust\nlayout segmentation in Bangla documents.\n","authors":["U Mong Sain Chak","Md. Asib Rahman"],"pdf_url":"https://arxiv.org/pdf/2308.14397v1.pdf","comment":"4 pages, 5 figures, 6 Tables"},{"id":"http://arxiv.org/abs/2308.14395v1","updated":"2023-08-28T08:20:30Z","published":"2023-08-28T08:20:30Z","title":"UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for\n Temporal Forgery Localization","summary":" The emergence of artificial intelligence-generated content (AIGC) has raised\nconcerns about the authenticity of multimedia content in various fields.\nHowever, existing research for forgery content detection has focused mainly on\nbinary classification tasks of complete videos, which has limited applicability\nin industrial settings. To address this gap, we propose UMMAFormer, a novel\nuniversal transformer framework for temporal forgery localization (TFL) that\npredicts forgery segments with multimodal adaptation. Our approach introduces a\nTemporal Feature Abnormal Attention (TFAA) module based on temporal feature\nreconstruction to enhance the detection of temporal differences. We also design\na Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the\nFeature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the\nproposed method, we contribute a novel Temporal Video Inpainting Localization\n(TVIL) dataset specifically tailored for video inpainting scenes. 
Our\nexperiments show that our approach achieves state-of-the-art performance on\nbenchmark datasets, including Lav-DF, TVIL, and Psynd, significantly\noutperforming previous methods. The code and data are available at\nhttps://github.com/ymhzyj/UMMAFormer/.\n","authors":["Rui Zhang","Hongxia Wang","Mingshan Du","Hanqing Liu","Yang Zhou","Qiang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.14395v1.pdf","comment":"11 pages, 8 figures, 66 references. This paper has been accepted for\n ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.14392v1","updated":"2023-08-28T08:15:43Z","published":"2023-08-28T08:15:43Z","title":"1st Place Solution for the 5th LSVOS Challenge: Video Instance\n Segmentation","summary":" Video instance segmentation is a challenging task that serves as the\ncornerstone of numerous downstream applications, including video editing and\nautonomous driving. In this report, we present further improvements to the SOTA\nVIS method, DVIS. First, we introduce a denoising training strategy for the\ntrainable tracker, allowing it to achieve more stable and accurate object\ntracking in complex and long videos. Additionally, we explore the role of\nvisual foundation models in video instance segmentation. By utilizing a frozen\nVIT-L model pre-trained by DINO v2, DVIS demonstrates remarkable performance\nimprovements. With these enhancements, our method achieves 57.9 AP and 56.0 AP\nin the development and test phases, respectively, and ultimately ranked 1st in\nthe VIS track of the 5th LSVOS Challenge. The code will be available at\nhttps://github.com/zhang-tao-whu/DVIS.\n","authors":["Tao Zhang","Xingye Tian","Yikang Zhou","Yu Wu","Shunping Ji","Cilin Yan","Xuebo Wang","Xin Tao","Yuan Zhang","Pengfei Wan"],"pdf_url":"https://arxiv.org/pdf/2308.14392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14391v1","updated":"2023-08-28T08:14:20Z","published":"2023-08-28T08:14:20Z","title":"FIRE: Food Image to REcipe generation","summary":" Food computing has emerged as a prominent multidisciplinary field of research\nin recent years. An ambitious goal of food computing is to develop end-to-end\nintelligent systems capable of autonomously producing recipe information for a\nfood image. Current image-to-recipe methods are retrieval-based and their\nsuccess depends heavily on the dataset size and diversity, as well as the\nquality of learned embeddings. Meanwhile, the emergence of powerful\nattention-based vision and language models presents a promising avenue for\naccurate and generalizable recipe generation, which has yet to be extensively\nexplored. This paper proposes FIRE, a novel multimodal methodology tailored to\nrecipe generation in the food computing domain, which generates the food title,\ningredients, and cooking instructions based on input food images. FIRE\nleverages the BLIP model to generate titles, utilizes a Vision Transformer with\na decoder for ingredient extraction, and employs the T5 model to generate\nrecipes incorporating titles and ingredients as inputs. We showcase two\npractical applications that can benefit from integrating FIRE with large\nlanguage model prompting: recipe customization to fit recipes to user\npreferences and recipe-to-code transformation to enable automated cooking\nprocesses. 
Our experimental findings validate the efficacy of our proposed\napproach, underscoring its potential for future advancements and widespread\nadoption in food computing.\n","authors":["Prateek Chhikara","Dhiraj Chaurasia","Yifan Jiang","Omkar Masur","Filip Ilievski"],"pdf_url":"https://arxiv.org/pdf/2308.14391v1.pdf","comment":"5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2303.11325v2","updated":"2023-08-28T08:00:52Z","published":"2023-03-20T17:59:03Z","title":"GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling\n for Multi-view 3D Understanding","summary":" Multi-view camera-based 3D detection is a challenging problem in computer\nvision. Recent works leverage a pretrained LiDAR detection model to transfer\nknowledge to a camera-based student network. However, we argue that there is a\nmajor domain gap between the LiDAR BEV features and the camera-based BEV\nfeatures, as they have different characteristics and are derived from different\nsources. In this paper, we propose Geometry Enhanced Masked Image Modeling\n(GeoMIM) to transfer the knowledge of the LiDAR model in a pretrain-finetune\nparadigm for improving the multi-view camera-based 3D detection. GeoMIM is a\nmulti-camera vision transformer with Cross-View Attention (CVA) blocks that\nuses LiDAR BEV features encoded by the pretrained BEV model as learning\ntargets. During pretraining, GeoMIM's decoder has a semantic branch that\ncompletes dense perspective-view features and a geometry branch that\nreconstructs dense perspective-view depth maps. The depth branch is designed to\nbe camera-aware by inputting the camera's parameters for better transfer\ncapability. Extensive results demonstrate that GeoMIM outperforms existing\nmethods on the nuScenes benchmark, achieving state-of-the-art performance for\ncamera-based 3D object detection and 3D segmentation. Code and pretrained\nmodels are available at https://github.com/Sense-X/GeoMIM.\n","authors":["Jihao Liu","Tai Wang","Boxiao Liu","Qihang Zhang","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2303.11325v2.pdf","comment":"Release code: https://github.com/Sense-X/GeoMIM"},{"id":"http://arxiv.org/abs/2212.12734v3","updated":"2023-08-28T07:58:48Z","published":"2022-12-24T13:35:31Z","title":"DDH-QA: A Dynamic Digital Humans Quality Assessment Database","summary":" In recent years, large amounts of effort have been put into pushing forward\nthe real-world application of dynamic digital humans (DDHs). However, most\ncurrent quality assessment research focuses on evaluating static 3D models and\nusually ignores motion distortions. Therefore, in this paper, we construct a\nlarge-scale dynamic digital human quality assessment (DDH-QA) database with\ndiverse motion content as well as multiple distortions to comprehensively study\nthe perceptual quality of DDHs. Both model-based distortion (noise,\ncompression) and motion-based distortion (binding error, motion unnaturalness)\nare taken into consideration. Ten types of common motion are employed to drive\nthe DDHs and a total of 800 DDHs are generated in the end. Afterward, we render\nthe video sequences of the distorted DDHs as the evaluation media and carry out\na well-controlled subjective experiment.
Then a benchmark experiment is\nconducted with the state-of-the-art video quality assessment (VQA) methods and\nthe experimental results show that existing VQA methods are limited in\nassessing the perceptual loss of DDHs.\n","authors":["Zicheng Zhang","Yingjie Zhou","Wei Sun","Wei Lu","Xiongkuo Min","Yu Wang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2212.12734v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14383v1","updated":"2023-08-28T07:56:13Z","published":"2023-08-28T07:56:13Z","title":"Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a\n Light-Weight ToF Sensor","summary":" Light-weight time-of-flight (ToF) depth sensors are compact and\ncost-efficient, and thus widely used on mobile devices for tasks such as\nautofocus and obstacle detection. However, due to the sparse and noisy depth\nmeasurements, these sensors have rarely been considered for dense geometry\nreconstruction. In this work, we present the first dense SLAM system with a\nmonocular camera and a light-weight ToF sensor. Specifically, we propose a\nmulti-modal implicit scene representation that supports rendering both the\nsignals from the RGB camera and light-weight ToF sensor which drives the\noptimization by comparing with the raw sensor inputs. Moreover, in order to\nguarantee successful pose tracking and reconstruction, we exploit a predicted\ndepth as an intermediate supervision and develop a coarse-to-fine optimization\nstrategy for efficient learning of the implicit representation. Finally, the\ntemporal information is explicitly exploited to deal with the noisy signals\nfrom light-weight ToF sensors to improve the accuracy and robustness of the\nsystem. Experiments demonstrate that our system well exploits the signals of\nlight-weight ToF sensors and achieves competitive results both on camera\ntracking and dense scene reconstruction. Project page:\nhttps://zju3dv.github.io/tof_slam/.\n","authors":["Xinyang Liu","Yijin Li","Yanbin Teng","Hujun Bao","Guofeng Zhang","Yinda Zhang","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2308.14383v1.pdf","comment":"Accepted to ICCV 2023 (Oral). Project Page:\n https://zju3dv.github.io/tof_slam/"},{"id":"http://arxiv.org/abs/2308.14378v1","updated":"2023-08-28T07:50:04Z","published":"2023-08-28T07:50:04Z","title":"GKGNet: Group K-Nearest Neighbor based Graph Convolutional Network for\n Multi-Label Image Recognition","summary":" Multi-Label Image Recognition (MLIR) is a challenging task that aims to\npredict multiple object labels in a single image while modeling the complex\nrelationships between labels and image regions. Although convolutional neural\nnetworks and vision transformers have succeeded in processing images as regular\ngrids of pixels or patches, these representations are sub-optimal for capturing\nirregular and discontinuous regions of interest. In this work, we present the\nfirst fully graph convolutional model, Group K-nearest neighbor based Graph\nconvolutional Network (GKGNet), which models the connections between semantic\nlabel embeddings and image patches in a flexible and unified graph structure.\nTo address the scale variance of different objects and to capture information\nfrom multiple perspectives, we propose the Group KGCN module for dynamic graph\nconstruction and message passing. Our experiments demonstrate that GKGNet\nachieves state-of-the-art performance with significantly lower computational\ncosts on the challenging multi-label datasets, i.e., MS-COCO and VOC2007.
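The label-to-patch graph underlying the GKGNet entry above can be illustrated with a plain k-nearest-neighbour lookup between label embeddings and patch tokens. This is a sketch of dynamic graph construction, not the paper's Group KGCN module:

```python
import torch

def label_to_patch_knn(label_emb, patch_tokens, k=8):
    """Connect each label embedding to its k nearest image patches.

    label_emb:    (L, C) semantic label embeddings
    patch_tokens: (N, C) patch features for one image
    Returns (L, k) patch indices, i.e. a dynamically constructed graph
    between label nodes and image regions.
    """
    d = torch.cdist(label_emb, patch_tokens)     # (L, N) pairwise distances
    return d.topk(k, largest=False).indices      # k nearest patches per label

labels = torch.randn(20, 256)                    # 20 candidate labels
patches = torch.randn(196, 256)                  # 14x14 ViT patch grid
edges = label_to_patch_knn(labels, patches)
print(edges.shape)                               # torch.Size([20, 8])
# Message passing would then aggregate the selected patch features into
# each label node to score that label's presence.
```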
We will release the code and models to facilitate future research in\nthis area.\n","authors":["Ruijie Yao","Sheng Jin","Lumin Xu","Wang Zeng","Wentao Liu","Chen Qian","Ping Luo","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2308.14378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14371v1","updated":"2023-08-28T07:40:48Z","published":"2023-08-28T07:40:48Z","title":"SuperUDF: Self-supervised UDF Estimation for Surface Reconstruction","summary":" Learning-based surface reconstruction based on unsigned distance functions\n(UDF) has many advantages such as handling open surfaces. We propose SuperUDF,\na self-supervised UDF learning which exploits a learned geometry prior for\nefficient training and a novel regularization for robustness to sparse\nsampling. The core idea of SuperUDF draws inspiration from the classical\nsurface approximation operator of locally optimal projection (LOP). The key\ninsight is that if the UDF is estimated correctly, the 3D points should be\nlocally projected onto the underlying surface following the gradient of the\nUDF. Based on that, a number of inductive biases on UDF geometry and a\npre-learned geometry prior are devised to learn UDF estimation efficiently. A\nnovel regularization loss is proposed to make SuperUDF robust to sparse\nsampling. Furthermore, we also contribute a learning-based mesh extraction from\nthe estimated UDFs. Extensive evaluations demonstrate that SuperUDF outperforms\nthe state of the arts on several public datasets in terms of both quality and\nefficiency. Code will be released after accteptance.\n","authors":["Hui Tian","Chenyang Zhu","Yifei Shi","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14369v1","updated":"2023-08-28T07:35:21Z","published":"2023-08-28T07:35:21Z","title":"Improving Lesion Volume Measurements on Digital Mammograms","summary":" Lesion volume is an important predictor for prognosis in breast cancer. We\nmake a step towards a more accurate lesion volume measurement on digital\nmammograms by developing a model that allows to estimate lesion volumes on\nprocessed mammograms, which are the images routinely used by radiologists in\nclinical practice as well as in breast cancer screening and are available in\nmedical centers. Processed mammograms are obtained from raw mammograms, which\nare the X-ray data coming directly from the scanner, by applying certain\nvendor-specific non-linear transformations. At the core of our volume\nestimation method is a physics-based algorithm for measuring lesion volumes on\nraw mammograms. We subsequently extend this algorithm to processed mammograms\nvia a deep learning image-to-image translation model that produces synthetic\nraw mammograms from processed mammograms in a multi-vendor setting. We assess\nthe reliability and validity of our method using a dataset of 1778 mammograms\nwith an annotated mass. Firstly, we investigate the correlations between lesion\nvolumes computed from mediolateral oblique and craniocaudal views, with a\nresulting Pearson correlation of 0.93 [95% confidence interval (CI) 0.92 -\n0.93]. Secondly, we compare the resulting lesion volumes from true and\nsynthetic raw data, with a resulting Pearson correlation of 0.998 [95% CI 0.998\n- 0.998] . 
Finally, for a subset of 100 mammograms with a malignant mass and\na concurrent MRI examination available, we analyze the agreement between lesion\nvolume on mammography and MRI, resulting in an intraclass correlation\ncoefficient of 0.81 [95% CI 0.73 - 0.87] for consistency and 0.78 [95% CI 0.66\n- 0.86] for absolute agreement. In conclusion, we developed an algorithm to\nmeasure mammographic lesion volume that reached excellent reliability and good\nvalidity, when using MRI as ground truth.\n","authors":["Nikita Moriakov","Jim Peters","Ritse Mann","Nico Karssemeijer","Jos van Dijck","Mireille Broeders","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2308.14369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07011v4","updated":"2023-08-28T07:29:03Z","published":"2023-05-11T17:53:29Z","title":"Region-Aware Pretraining for Open-Vocabulary Object Detection with\n Vision Transformers","summary":" We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a\ncontrastive image-text pretraining recipe to bridge the gap between image-level\npretraining and open-vocabulary object detection. At the pretraining phase, we\npropose to randomly crop and resize regions of positional embeddings instead of\nusing the whole image positional embeddings. This better matches the use of\npositional embeddings at region-level in the detection finetuning phase. In\naddition, we replace the common softmax cross entropy loss in contrastive\nlearning with focal loss to better learn the informative yet difficult\nexamples. Finally, we leverage recent advances in novel object proposals to\nimprove open-vocabulary detection finetuning. We evaluate our full model on the\nLVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer.\nRO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best\nexisting approach by +7.8 points in addition to competitive zero-shot transfer\ndetection. Surprisingly, RO-ViT improves the image-level representation as well\nand achieves the state of the art on 9 out of 12 metrics on COCO and Flickr\nimage-text retrieval benchmarks, outperforming competitive approaches with\nlarger models.\n","authors":["Dahun Kim","Anelia Angelova","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2305.07011v4.pdf","comment":"CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B\n result"},{"id":"http://arxiv.org/abs/2302.06039v2","updated":"2023-08-28T07:19:48Z","published":"2023-02-13T00:46:34Z","title":"Predicting Class Distribution Shift for Reliable Domain Adaptive Object\n Detection","summary":" Unsupervised Domain Adaptive Object Detection (UDA-OD) uses unlabelled data\nto improve the reliability of robotic vision systems in open-world\nenvironments. Previous approaches to UDA-OD based on self-training have been\neffective in overcoming changes in the general appearance of images. However,\nshifts in a robot's deployment environment can also impact the likelihood that\ndifferent objects will occur, termed class distribution shift. Motivated by\nthis, we propose a framework for explicitly addressing class distribution shift\nto improve pseudo-label reliability in self-training. Our approach uses the\ndomain invariance and contextual understanding of a pre-trained joint vision\nand language model to predict the class distribution of unlabelled data. By\naligning the class distribution of pseudo-labels with this prediction, we\nprovide weak supervision of pseudo-label accuracy.
To further account for low\nquality pseudo-labels early in self-training, we propose an approach to\ndynamically adjust the number of pseudo-labels per image based on model\nconfidence. Our method outperforms state-of-the-art approaches on several\nbenchmarks, including a 4.7 mAP improvement when facing challenging class\ndistribution shift.\n","authors":["Nicolas Harvey Chapman","Feras Dayoub","Will Browne","Christopher Lehnert"],"pdf_url":"https://arxiv.org/pdf/2302.06039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10425v2","updated":"2023-08-28T07:06:35Z","published":"2023-02-21T03:34:15Z","title":"Instance-incremental Scene Graph Generation from Real-world Point Clouds\n via Normalizing Flows","summary":" This work introduces a new task of instance-incremental scene graph\ngeneration: Given a scene of the point cloud, representing it as a graph and\nautomatically increasing novel instances. A graph denoting the object layout of\nthe scene is finally generated. It is an important task since it helps to guide\nthe insertion of novel 3D objects into a real-world scene in vision-based\napplications like augmented reality. It is also challenging because the\ncomplexity of the real-world point cloud brings difficulties in learning object\nlayout experiences from the observation data (non-empty rooms with labeled\nsemantics). We model this task as a conditional generation problem and propose\na 3D autoregressive framework based on normalizing flows (3D-ANF) to address\nit. First, we represent the point cloud as a graph by extracting the label\nsemantics and contextual relationships. Next, a model based on normalizing\nflows is introduced to map the conditional generation of graphic elements into\nthe Gaussian process. The mapping is invertible. Thus, the real-world\nexperiences represented in the observation data can be modeled in the training\nphase, and novel instances can be autoregressively generated based on the\nGaussian process in the testing phase. To evaluate the performance of our\nmethod sufficiently, we implement this new task on the indoor benchmark dataset\n3DSSG-O27R16 and our newly proposed graphical dataset of outdoor scenes GPL3D.\nExperiments show that our method generates reliable novel graphs from the\nreal-world point cloud and achieves state-of-the-art performance on the\ndatasets.\n","authors":["Chao Qi","Jianqin Yin","Jinghang Xu","Pengxiang Ding"],"pdf_url":"https://arxiv.org/pdf/2302.10425v2.pdf","comment":"Accepted by IEEE TCSVT. The supplementary material is available in\n the media column of the journal version of the article"},{"id":"http://arxiv.org/abs/2308.14334v1","updated":"2023-08-28T06:25:40Z","published":"2023-08-28T06:25:40Z","title":"MetaWeather: Few-Shot Weather-Degraded Image Restoration via Degradation\n Pattern Matching","summary":" Real-world vision tasks frequently suffer from the appearance of adverse\nweather conditions including rain, fog, snow, and raindrops in captured images.\nRecently, several generic methods for restoring weather-degraded images have\nbeen proposed, aiming to remove multiple types of adverse weather effects\npresent in the images. However, these methods have considered weather as\ndiscrete and mutually exclusive variables, leading to failure in generalizing\nto unforeseen weather conditions beyond the scope of the training data, such as\nthe co-occurrence of rain, fog, and raindrops. 
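The invertible mapping that normalizing-flow generators such as 3D-ANF rely on can be illustrated in one dimension. This toy sketch (not the 3D-ANF implementation) maps data through an affine transform to a standard Gaussian base, inverts it exactly, and evaluates the change-of-variables log-density:

```python
# Toy 1-D "flow": an affine bijection to a Gaussian base distribution.
import numpy as np

rng = np.random.default_rng(0)
scale, shift = 2.0, -1.0                 # learnable parameters in a real flow

x = rng.normal(shift, scale, size=1000)  # observed "data"
z = (x - shift) / scale                  # forward map: data -> Gaussian base
x_rec = z * scale + shift                # exact inverse: base -> data

# change-of-variables: log p(x) = log N(z; 0, 1) - log|dx/dz|
log_p = -0.5 * (z**2 + np.log(2 * np.pi)) - np.log(scale)
assert np.allclose(x, x_rec)             # the mapping is invertible
print(log_p.mean())
```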
To this end, weather-degraded\nimage restoration models should have flexible adaptability to the current\nunknown weather condition to ensure reliable and optimal performance. The\nadaptation method should also be able to cope with data scarcity for real-world\nadaptation. This paper proposes MetaWeather, a few-shot weather-degraded image\nrestoration method for arbitrary weather conditions. For this, we devise the\ncore piece of MetaWeather, coined Degradation Pattern Matching Module (DPMM),\nwhich leverages representations from a few-shot support set by matching\nfeatures between input and sample images under new weather conditions. In\naddition, we build meta-knowledge with episodic meta-learning on top of our\nMetaWeather architecture to provide flexible adaptability. In the meta-testing\nphase, we adopt a parameter-efficient fine-tuning method to preserve the\nprebuilt knowledge and avoid the overfitting problem. Experiments on the BID\nTask II.A dataset show our method achieves the best performance on PSNR and\nSSIM compared to state-of-the-art image restoration methods. Code is available\nat (TBA).\n","authors":["Youngrae Kim","Younggeol Cho","Thanh-Tung Nguyen","Dongman Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14334v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.14332v1","updated":"2023-08-28T06:22:10Z","published":"2023-08-28T06:22:10Z","title":"Attention-Guided Lidar Segmentation and Odometry Using Image-to-Point\n Cloud Saliency Transfer","summary":" LiDAR odometry estimation and 3D semantic segmentation are crucial for\nautonomous driving, which has achieved remarkable advances recently. However,\nthese tasks are challenging due to the imbalance of points in different\nsemantic categories for 3D semantic segmentation and the influence of dynamic\nobjects for LiDAR odometry estimation, which increases the importance of using\nrepresentative/salient landmarks as reference points for robust feature\nlearning. To address these challenges, we propose a saliency-guided approach\nthat leverages attention information to improve the performance of LiDAR\nodometry estimation and semantic segmentation models. Unlike in the image\ndomain, only a few studies have addressed point cloud saliency information due\nto the lack of annotated training data. To alleviate this, we first present a\nuniversal framework to transfer saliency distribution knowledge from color\nimages to point clouds, and use this to construct a pseudo-saliency dataset\n(i.e. FordSaliency) for point clouds. Then, we adopt point cloud-based\nbackbones to learn saliency distribution from pseudo-saliency labels, which is\nfollowed by our proposed SalLiDAR module. SalLiDAR is a saliency-guided 3D\nsemantic segmentation model that integrates saliency information to improve\nsegmentation performance. Finally, we introduce SalLONet, a self-supervised\nsaliency-guided LiDAR odometry network that uses the semantic and saliency\npredictions of SalLiDAR to achieve better odometry estimation. 
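The image-to-point-cloud saliency transfer described above can be sketched, under an assumed pinhole camera model, as projecting each LiDAR point into the image and reading off the saliency value as its pseudo-label; the names, shapes, and nearest-neighbor sampling are illustrative assumptions rather than the FordSaliency pipeline:

```python
# Project 3D points into an image and sample a saliency map as pseudo-labels.
import numpy as np

def transfer_saliency(points_cam, saliency, K):
    """points_cam: (N, 3) in camera frame; saliency: (H, W); K: (3, 3)."""
    H, W = saliency.shape
    z = points_cam[:, 2]
    uvw = (K @ points_cam.T).T            # homogeneous pixel coordinates
    u, v = uvw[:, 0] / z, uvw[:, 1] / z   # perspective division
    valid = (z > 0) & (u >= 0) & (u < W) & (v >= 0) & (v < H)
    labels = np.zeros(len(points_cam))
    labels[valid] = saliency[v[valid].astype(int), u[valid].astype(int)]
    return labels, valid

rng = np.random.default_rng(0)
K = np.array([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]])
points = rng.normal([0, 0, 10], [2, 2, 3], size=(1000, 3))
labels, valid = transfer_saliency(points, rng.random((480, 640)), K)
print(valid.sum(), "points received pseudo-saliency labels")
```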
Our extensive\nexperiments on benchmark datasets demonstrate that the proposed SalLiDAR and\nSalLONet models achieve state-of-the-art performance against existing methods,\nhighlighting the effectiveness of image-to-LiDAR saliency knowledge transfer.\nSource code will be available at https://github.com/nevrez/SalLONet.\n","authors":["Guanqun Ding","Nevrez Imamoglu","Ali Caglayan","Masahiro Murakawa","Ryosuke Nakamura"],"pdf_url":"https://arxiv.org/pdf/2308.14332v1.pdf","comment":"33 pages, 12 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2308.14324v1","updated":"2023-08-28T06:09:25Z","published":"2023-08-28T06:09:25Z","title":"CPFES: Physical Fitness Evaluation Based on Canadian Agility and\n Movement Skill Assessment","summary":" In recent years, the assessment of fundamental movement skills integrated\nwith physical education has focused on both teaching practice and the\nfeasibility of assessment. The object of assessment has shifted from multiple\nages to subdivided ages, while the content of assessment has changed from\ncomplex and time-consuming to concise and efficient. Therefore, we apply deep\nlearning to physical fitness evaluation, we propose a system based on the\nCanadian Agility and Movement Skill Assessment (CAMSA) Physical Fitness\nEvaluation System (CPFES), which evaluates children's physical fitness based on\nCAMSA, and gives recommendations based on the scores obtained by CPFES to help\nchildren grow. We have designed a landmark detection module and a pose\nestimation module, and we have also designed a pose evaluation module for the\nCAMSA criteria that can effectively evaluate the actions of the child being\ntested. Our experimental results demonstrate the high accuracy of the proposed\nsystem.\n","authors":["Pengcheng Dong","Xiaojin Mao","Lixia Fan","Wenbo Wan","Jiande Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14322v1","updated":"2023-08-28T06:05:23Z","published":"2023-08-28T06:05:23Z","title":"Machine Unlearning Methodology base on Stochastic Teacher Network","summary":" The rise of the phenomenon of the \"right to be forgotten\" has prompted\nresearch on machine unlearning, which grants data owners the right to actively\nwithdraw data that has been used for model training, and requires the\nelimination of the contribution of that data to the model. A simple method to\nachieve this is to use the remaining data to retrain the model, but this is not\nacceptable for other data owners who continue to participate in training.\nExisting machine unlearning methods have been found to be ineffective in\nquickly removing knowledge from deep learning models. This paper proposes using\na stochastic network as a teacher to expedite the mitigation of the influence\ncaused by forgotten data on the model. We performed experiments on three\ndatasets, and the findings demonstrate that our approach can efficiently\nmitigate the influence of target data on the model within a single epoch. This\nallows for one-time erasure and reconstruction of the model, and the\nreconstruction model achieves the same performance as the retrained model.\n","authors":["Xulong Zhang","Jianzong Wang","Ning Cheng","Yifu Sun","Chuanyao Zhang","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14322v1.pdf","comment":"Accepted by 19th International Conference on Advanced Data Mining and\n Applications. 
(ADMA 2023)"},{"id":"http://arxiv.org/abs/2304.02163v2","updated":"2023-08-28T06:03:39Z","published":"2023-04-04T23:41:20Z","title":"GINA-3D: Learning to Generate Implicit Neural Assets in the Wild","summary":" Modeling the 3D world from sensor data for simulation is a scalable way of\ndeveloping testing and validation environments for robotic learning problems\nsuch as autonomous driving. However, manually creating or re-creating\nreal-world-like environments is difficult, expensive, and not scalable. Recent\ngenerative model techniques have shown promising progress to address such\nchallenges by learning 3D assets using only plentiful 2D images -- but still\nsuffer limitations as they leverage either human-curated image datasets or\nrenderings from manually-created synthetic 3D environments. In this paper, we\nintroduce GINA-3D, a generative model that uses real-world driving data from\ncamera and LiDAR sensors to create realistic 3D implicit neural assets of\ndiverse vehicles and pedestrians. Compared to the existing image datasets, the\nreal-world driving setting poses new challenges due to occlusions,\nlighting-variations and long-tail distributions. GINA-3D tackles these\nchallenges by decoupling representation learning and generative modeling into\ntwo stages with a learned tri-plane latent structure, inspired by recent\nadvances in generative modeling of images. To evaluate our approach, we\nconstruct a large-scale object-centric dataset containing over 1.2M images of\nvehicles and pedestrians from the Waymo Open Dataset, and a new set of 80K\nimages of long-tail instances such as construction equipment, garbage trucks,\nand cable cars. We compare our model with existing approaches and demonstrate\nthat it achieves state-of-the-art performance in quality and diversity for both\ngenerated images and geometries.\n","authors":["Bokui Shen","Xinchen Yan","Charles R. Qi","Mahyar Najibi","Boyang Deng","Leonidas Guibas","Yin Zhou","Dragomir Anguelov"],"pdf_url":"https://arxiv.org/pdf/2304.02163v2.pdf","comment":"Accepted by CVPR 2023; Our WOD-ObjectAsset can be accessed through\n waymo.com/open"},{"id":"http://arxiv.org/abs/2211.03989v3","updated":"2023-08-28T05:56:47Z","published":"2022-11-08T04:00:23Z","title":"$BT^2$: Backward-compatible Training with Basis Transformation","summary":" Modern retrieval system often requires recomputing the representation of\nevery piece of data in the gallery when updating to a better representation\nmodel. This process is known as backfilling and can be especially costly in the\nreal world where the gallery often contains billions of samples. Recently,\nresearchers have proposed the idea of Backward Compatible Training (BCT) where\nthe new representation model can be trained with an auxiliary loss to make it\nbackward compatible with the old representation. In this way, the new\nrepresentation can be directly compared with the old representation, in\nprinciple avoiding the need for any backfilling. However, followup work shows\nthat there is an inherent tradeoff where a backward compatible representation\nmodel cannot simultaneously maintain the performance of the new model itself.\nThis paper reports our ``not-so-surprising'' finding that adding extra\ndimensions to the representation can help here. However, we also found that\nnaively increasing the dimension of the representation did not work. To deal\nwith this, we propose Backward-compatible Training with a novel Basis\nTransformation ($BT^2$). 
A basis transformation (BT) is basically a learnable\nset of parameters that applies an orthonormal transformation. Such a\ntransformation possesses an important property whereby the original information\ncontained in its input is retained in its output. We show in this paper how a\nBT can be utilized to add only the necessary amount of additional dimensions.\nWe empirically verify the advantage of $BT^2$ over other state-of-the-art\nmethods in a wide range of settings. We then further extend $BT^2$ to other\nchallenging yet more practical settings, including significant change in model\narchitecture (CNN to Transformers), modality change, and even a series of\nupdates in the model architecture mimicking the evolution of deep learning\nmodels.\n","authors":["Yifei Zhou","Zilu Li","Abhinav Shrivastava","Hengshuang Zhao","Antonio Torralba","Taipeng Tian","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2211.03989v3.pdf","comment":"iccv2023 camera ready"},{"id":"http://arxiv.org/abs/2308.14316v1","updated":"2023-08-28T05:38:43Z","published":"2023-08-28T05:38:43Z","title":"UniPT: Universal Parallel Tuning for Transfer Learning with Efficient\n Parameter and Memory","summary":" Fine-tuning pre-trained models has emerged as a powerful technique in\nnumerous domains, owing to its ability to leverage enormous pre-existing\nknowledge and achieve remarkable performance on downstream tasks. However,\nupdating the parameters of entire networks is computationally intensive.\nAlthough state-of-the-art parameter-efficient transfer learning (PETL) methods\nsignificantly reduce the trainable parameters and storage demand, almost all of\nthem still need to back-propagate the gradients through large pre-trained\nnetworks. This memory-extensive characteristic extremely limits the\napplicability of PETL methods in real-world scenarios. To this end, we propose\na new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT).\nSpecifically, we facilitate the transfer process via a lightweight learnable\nparallel network, which consists of two modules: 1) A parallel interaction\nmodule that decouples the inherently sequential connections and processes the\nintermediate activations detachedly of the pre-trained network. 2) A confidence\naggregation module that learns optimal strategies adaptively for integrating\ncross-layer features. We evaluate UniPT with different backbones (e.g.,\nVSE$\\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging\nvision-and-language tasks (i.e., image-text retrieval, video-text retrieval,\nvisual question answering, compositional question answering, and visual\ngrounding). Extensive ablations on ten datasets have validated that our UniPT\ncan not only dramatically reduce memory consumption and outperform the best\nmemory-efficient competitor, but also achieve higher performance than existing\nPETL methods in a low-memory scenario on different architectures. Our code is\npublicly available at: https://github.com/Paranioar/UniPT.\n","authors":["Haiwen Diao","Bo Wan","Ying Zhang","Xu Jia","Huchuan Lu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14316v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14312v1","updated":"2023-08-28T05:29:59Z","published":"2023-08-28T05:29:59Z","title":"Local-Global Pseudo-label Correction for Source-free Domain Adaptive\n Medical Image Segmentation","summary":" Domain shift is a commonly encountered issue in medical imaging solutions,\nprimarily caused by variations in imaging devices and data sources. 
To mitigate\nthis problem, unsupervised domain adaptation techniques have been employed.\nHowever, concerns regarding patient privacy and potential degradation of image\nquality have led to an increased focus on source-free domain adaptation. In\nthis study, we address the issue of false labels in self-training-based\nsource-free domain adaptive medical image segmentation methods. To correct\nerroneous pseudo-labels, we propose a novel approach called the local-global\npseudo-label correction (LGDA) method for source-free domain adaptive medical\nimage segmentation. Our method consists of two components: an offline local\ncontext-based pseudo-label correction method that utilizes local context\nsimilarity in image space, and an online global pseudo-label correction method\nbased on class prototypes, which corrects erroneously predicted pseudo-labels\nby considering the relative distance between pixel-wise feature vectors and\nprototype vectors. We evaluate the performance of our method on three benchmark\nfundus image datasets for optic disc and cup segmentation. Our method achieves\nsuperior performance compared to the state-of-the-art approaches, even without\nusing any source data.\n","authors":["Yanyu Ye","Zhengxi Zhang","Chunna Tianb","Wei wei"],"pdf_url":"https://arxiv.org/pdf/2308.14312v1.pdf","comment":"30 pages, 7 figures"},{"id":"http://arxiv.org/abs/2202.13799v3","updated":"2023-08-28T04:52:53Z","published":"2022-02-28T13:48:41Z","title":"One-shot Ultra-high-Resolution Generative Adversarial Network That\n Synthesizes 16K Images On A Single GPU","summary":" We propose a one-shot ultra-high-resolution generative adversarial network\n(OUR-GAN) framework that generates non-repetitive 16K (16,384 x 8,640) images\nfrom a single training image and is trainable on a single consumer GPU. OUR-GAN\ngenerates an initial image that is visually plausible and varied in shape at\nlow resolution, and then gradually increases the resolution by adding detail\nthrough super-resolution. Since OUR-GAN learns from a real\nultra-high-resolution (UHR) image, it can synthesize large shapes with fine\ndetails and long-range coherence, which is difficult to achieve with\nconventional generative models that rely on the patch distribution learned from\nrelatively small images. OUR-GAN can synthesize high-quality 16K images with\n12.5 GB of GPU memory and 4K images with only 4.29 GB as it synthesizes a UHR\nimage part by part through seamless subregion-wise super-resolution.\nAdditionally, OUR-GAN improves visual coherence while maintaining diversity by\napplying vertical positional convolution. In experiments on the ST4K and RAISE\ndatasets, OUR-GAN exhibited improved fidelity, visual coherency, and diversity\ncompared with the baseline one-shot synthesis models. To the best of our\nknowledge, OUR-GAN is the first one-shot image synthesizer that generates\nnon-repetitive UHR images on a single consumer GPU. The synthesized image\nsamples are presented at https://our-gan.github.io.\n","authors":["Junseok Oh","Donghwee Yoon","Injung Kim"],"pdf_url":"https://arxiv.org/pdf/2202.13799v3.pdf","comment":"36 pages, 26 figures"},{"id":"http://arxiv.org/abs/2303.12091v2","updated":"2023-08-28T04:50:57Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution.
Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15699v2","updated":"2023-08-28T04:46:01Z","published":"2023-03-28T03:05:25Z","title":"Enhancing Breast Cancer Risk Prediction by Incorporating Prior Images","summary":" Recently, deep learning models have shown the potential to predict breast\ncancer risk and enable targeted screening strategies, but current models do not\nconsider the change in the breast over time. In this paper, we present a new\nmethod, PRIME+, for breast cancer risk prediction that leverages prior\nmammograms using a transformer decoder, outperforming a state-of-the-art risk\nprediction method that only uses mammograms from a single time point. We\nvalidate our approach on a dataset with 16,113 exams and further demonstrate\nthat it effectively captures patterns of changes from prior mammograms, such as\nchanges in breast density, resulting in improved short-term and long-term\nbreast cancer risk prediction. Experimental results show that our model\nachieves a statistically significant improvement in performance over the\nstate-of-the-art based model, with a C-index increase from 0.68 to 0.73 (p <\n0.05) on held-out test sets.\n","authors":["Hyeonsoo Lee","Junha Kim","Eunkyung Park","Minjeong Kim","Taesoo Kim","Thijs Kooi"],"pdf_url":"https://arxiv.org/pdf/2303.15699v2.pdf","comment":"MICCAI 2023 accepted"},{"id":"http://arxiv.org/abs/2304.12666v2","updated":"2023-08-28T04:43:57Z","published":"2023-04-25T09:12:37Z","title":"Bayesian Optimization Meets Self-Distillation","summary":" Bayesian optimization (BO) has contributed greatly to improving model\nperformance by suggesting promising hyperparameter configurations iteratively\nbased on observations from multiple training trials. However, only partial\nknowledge (i.e., the measured performances of trained models and their\nhyperparameter configurations) from previous trials is transferred. On the\nother hand, Self-Distillation (SD) only transfers partial knowledge learned by\nthe task model itself. To fully leverage the various knowledge gained from all\ntraining trials, we propose the BOSS framework, which combines BO and SD. BOSS\nsuggests promising hyperparameter configurations through BO and carefully\nselects pre-trained models from previous trials for SD, which are otherwise\nabandoned in the conventional BO process. 
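Schematically, the BOSS loop described above alternates hyperparameter suggestion with teacher selection from earlier trials. In the sketch below, random search stands in for the BO acquisition step and the objective is fabricated, so this mirrors only the structure of the idea, not the actual BOSS algorithm:

```python
# Each trial trains with a suggested config and distills from the best
# previous checkpoint, which conventional BO would simply discard.
import random

def suggest_config():                     # stand-in for a BO acquisition step
    return {"lr": 10 ** random.uniform(-4, -1)}

def train(config, teacher=None):
    # Fabricated objective: distilling from a good teacher adds a bonus.
    base = 1.0 - abs(config["lr"] - 0.01)
    return {"config": config, "score": base + (0.05 if teacher else 0.0)}

random.seed(0)
trials = []
for _ in range(10):
    teacher = max(trials, key=lambda t: t["score"]) if trials else None
    trials.append(train(suggest_config(), teacher))
print(max(t["score"] for t in trials))
```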
BOSS achieves significantly better\nperformance than both BO and SD in a wide range of tasks including general\nimage classification, learning with noisy labels, semi-supervised learning, and\nmedical image analysis tasks.\n","authors":["HyunJae Lee","Heon Song","Hyeonsoo Lee","Gi-hyeon Lee","Suyeong Park","Donggeun Yoo"],"pdf_url":"https://arxiv.org/pdf/2304.12666v2.pdf","comment":"ICCV 2023 accepted"},{"id":"http://arxiv.org/abs/2308.14298v1","updated":"2023-08-28T04:34:50Z","published":"2023-08-28T04:34:50Z","title":"Direct initial orbit determination","summary":" Initial orbit determination (IOD) is an important early step in the\nprocessing chain that makes sense of and reconciles the multiple optical\nobservations of a resident space object. IOD methods generally operate on\nline-of-sight (LOS) vectors extracted from images of the object, hence the LOS\nvectors can be seen as discrete point samples of the raw optical measurements.\nTypically, the number of LOS vectors used by an IOD method is much smaller than\nthe available measurements (\\ie, the set of pixel intensity values), hence\ncurrent IOD methods arguably under-utilize the rich information present in the\ndata. In this paper, we propose a \\emph{direct} IOD method called D-IOD that\nfits the orbital parameters directly on the observed streak images, without\nrequiring LOS extraction. Since it does not utilize LOS vectors, D-IOD avoids\npotential inaccuracies or errors due to an imperfect LOS extraction step. Two\ninnovations underpin our novel orbit-fitting paradigm: first, we introduce a\nnovel non-linear least-squares objective function that computes the loss\nbetween the candidate-orbit-generated streak images and the observed streak\nimages. Second, the objective function is minimized with a gradient descent\napproach that is embedded in our proposed optimization strategies designed for\nstreak images. We demonstrate the effectiveness of D-IOD on a variety of\nsimulated scenarios and challenging real streak images.\n","authors":["Chee-Kheng Chng","Trent Jansen-Sturgeon","Timothy Payne","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2308.14298v1.pdf","comment":"28 pages, 17 figures, Submitted to Advances in Space Research"},{"id":"http://arxiv.org/abs/2308.06725v2","updated":"2023-08-28T04:27:35Z","published":"2023-08-13T09:05:56Z","title":"CLE Diffusion: Controllable Light Enhancement Diffusion Model","summary":" Low light enhancement has gained increasing importance with the rapid\ndevelopment of visual creation and editing. However, most existing enhancement\nalgorithms are designed to homogeneously increase the brightness of images to a\npre-defined extent, limiting the user experience. To address this issue, we\npropose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a\nnovel diffusion framework to provide users with rich controllability. Built\nwith a conditional diffusion model, we introduce an illumination embedding to\nlet users control their desired brightness level. Additionally, we incorporate\nthe Segment-Anything Model (SAM) to enable user-friendly region\ncontrollability, where users can click on objects to specify the regions they\nwish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves\ncompetitive performance regarding quantitative metrics, qualitative results,\nand versatile controllability. 
Project page:\nhttps://yuyangyin.github.io/CLEDiffusion/\n","authors":["Yuyang Yin","Dejia Xu","Chuangchuang Tan","Ping Liu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2308.06725v2.pdf","comment":"Accepted In Proceedings of the 31st ACM International Conference on\n Multimedia (MM' 23)"},{"id":"http://arxiv.org/abs/2307.05016v2","updated":"2023-08-28T04:05:15Z","published":"2023-07-11T05:32:21Z","title":"TRansPose: Large-Scale Multispectral Dataset for Transparent Object","summary":" Transparent objects are encountered frequently in our daily lives, yet\nrecognizing them poses challenges for conventional vision sensors due to their\nunique material properties, not being well perceived from RGB or depth cameras.\nOvercoming this limitation, thermal infrared cameras have emerged as a\nsolution, offering improved visibility and shape information for transparent\nobjects. In this paper, we present TRansPose, the first large-scale\nmultispectral dataset that combines stereo RGB-D, thermal infrared (TIR)\nimages, and object poses to promote transparent object research. The dataset\nincludes 99 transparent objects, encompassing 43 household items, 27 recyclable\ntrashes, 29 chemical laboratory equivalents, and 12 non-transparent objects. It\ncomprises a vast collection of 333,819 images and 4,000,056 annotations,\nproviding instance-level segmentation masks, ground-truth poses, and completed\ndepth information. The data was acquired using a FLIR A65 thermal infrared\n(TIR) camera, two Intel RealSense L515 RGB-D cameras, and a Franka Emika Panda\nrobot manipulator. Spanning 87 sequences, TRansPose covers various challenging\nreal-life scenarios, including objects filled with water, diverse lighting\nconditions, heavy clutter, non-transparent or translucent containers, objects\nin plastic bags, and multi-stacked objects. TRansPose dataset can be accessed\nfrom the following link: https://sites.google.com/view/transpose-dataset\n","authors":["Jeongyun Kim","Myung-Hwan Jeon","Sangwoo Jung","Wooseong Yang","Minwoo Jung","Jaeho Shin","Ayoung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.05016v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.14286v1","updated":"2023-08-28T03:57:37Z","published":"2023-08-28T03:57:37Z","title":"Bridging Cross-task Protocol Inconsistency for Distillation in Dense\n Object Detection","summary":" Knowledge distillation (KD) has shown potential for learning compact models\nin dense object detection. However, the commonly used softmax-based\ndistillation ignores the absolute classification scores for individual\ncategories. Thus, the optimum of the distillation loss does not necessarily\nlead to the optimal student classification scores for dense object detectors.\nThis cross-task protocol inconsistency is critical, especially for dense object\ndetectors, since the foreground categories are extremely imbalanced. To address\nthe issue of protocol differences between distillation and classification, we\npropose a novel distillation method with cross-task consistent protocols,\ntailored for the dense object detection. For classification distillation, we\naddress the cross-task protocol inconsistency problem by formulating the\nclassification logit maps in both teacher and student models as multiple\nbinary-classification maps and applying a binary-classification distillation\nloss to each map. 
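Concretely, the classification side of this recipe can be sketched by treating each class channel as an independent binary map and using the teacher's sigmoid probabilities as soft targets; the shapes and the plain binary cross-entropy form are assumptions for illustration, not the exact BCKD loss:

```python
# Distill (C, H, W) classification logit maps as C independent binary maps.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def binary_distill_loss(student_logits, teacher_logits, eps=1e-7):
    p_t = sigmoid(teacher_logits)          # teacher soft targets per class
    p_s = sigmoid(student_logits)
    bce = -(p_t * np.log(p_s + eps) + (1 - p_t) * np.log(1 - p_s + eps))
    return bce.mean()                      # average over classes and locations

rng = np.random.default_rng(0)
student = rng.normal(size=(80, 32, 32))    # hypothetical student logit maps
teacher = rng.normal(size=(80, 32, 32))    # hypothetical teacher logit maps
print(binary_distill_loss(student, teacher))
```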
For localization distillation, we design an IoU-based\nLocalization Distillation Loss that is free from specific network structures\nand can be compared with existing localization distillation losses. Our\nproposed method is simple but effective, and experimental results demonstrate\nits superiority over existing methods. Code is available at\nhttps://github.com/TinyTigerPan/BCKD.\n","authors":["Longrong Yang","Xianpan Zhou","Xuewei Li","Liang Qiao","Zheyang Li","Ziwei Yang","Gaoang Wang","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2308.14286v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2209.15304v4","updated":"2023-08-28T03:16:50Z","published":"2022-09-30T08:23:26Z","title":"Hiding Visual Information via Obfuscating Adversarial Perturbations","summary":" Growing leakage and misuse of visual information raise security and privacy\nconcerns, which promotes the development of information protection. Existing\nadversarial perturbations-based methods mainly focus on the de-identification\nagainst deep learning models. However, the inherent visual information of the\ndata has not been well protected. In this work, inspired by the Type-I\nadversarial attack, we propose an adversarial visual information hiding method\nto protect the visual privacy of data. Specifically, the method generates\nobfuscating adversarial perturbations to obscure the visual information of the\ndata. Meanwhile, it maintains the hidden objectives to be correctly predicted\nby models. In addition, our method does not modify the parameters of the\napplied model, which makes it flexible for different scenarios. Experimental\nresults on the recognition and classification tasks demonstrate that the\nproposed method can effectively hide visual information and hardly affect the\nperformances of models. The code is available in the supplementary material.\n","authors":["Zhigang Su","Dawei Zhou","Nannan Wangu","Decheng Li","Zhen Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2209.15304v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15663v3","updated":"2023-08-28T03:02:16Z","published":"2022-10-27T17:59:50Z","title":"Deep Generative Models on 3D Representations: A Survey","summary":" Generative models aim to learn the distribution of observed data by\ngenerating new instances. With the advent of neural networks, deep generative\nmodels, including variational autoencoders (VAEs), generative adversarial\nnetworks (GANs), and diffusion models (DMs), have progressed remarkably in\nsynthesizing 2D images. Recently, researchers started to shift focus from 2D to\n3D space, considering that 3D data is more closely aligned with our physical\nworld and holds immense practical potential. However, unlike 2D images, which\npossess an inherent and efficient representation (\\textit{i.e.}, a pixel grid),\nrepresenting 3D data poses significantly greater challenges. Ideally, a robust\n3D representation should be capable of accurately modeling complex shapes and\nappearances while being highly efficient in handling high-resolution data with\nhigh processing speeds and low memory requirements. Regrettably, existing 3D\nrepresentations, such as point clouds, meshes, and neural fields, often fail to\nsatisfy all of these requirements simultaneously. In this survey, we thoroughly\nreview the ongoing developments of 3D generative models, including methods that\nemploy 2D and 3D supervision. Our analysis centers on generative models, with a\nparticular focus on the representations utilized in this context. 
We believe\nour survey will help the community to track the field's evolution and to spark\ninnovative ideas to propel progress towards solving this challenging task.\n","authors":["Zifan Shi","Sida Peng","Yinghao Xu","Andreas Geiger","Yiyi Liao","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2210.15663v3.pdf","comment":"Github: https://github.com/justimyhxu/awesome-3D-generation"},{"id":"http://arxiv.org/abs/2211.01146v3","updated":"2023-08-28T02:59:24Z","published":"2022-11-02T14:22:50Z","title":"DynamicISP: Dynamically Controlled Image Signal Processor for Image\n Recognition","summary":" Image Signal Processors (ISPs) play important roles in image recognition\ntasks as well as in the perceptual quality of captured images. In most cases,\nexperts make a lot of effort to manually tune many parameters of ISPs, but the\nparameters are sub-optimal. In the literature, two types of techniques have\nbeen actively studied: a machine learning-based parameter tuning technique and\na DNN-based ISP technique. The former is lightweight but lacks expressive\npower. The latter has expressive power, but the computational cost is too heavy\non edge devices. To solve these problems, we propose \"DynamicISP,\" which\nconsists of multiple classical ISP functions and dynamically controls the\nparameters of each frame according to the recognition result of the previous\nframe. We show our method successfully controls the parameters of multiple ISP\nfunctions and achieves state-of-the-art accuracy with low computational cost in\nsingle and multi-category object detection tasks.\n","authors":["Masakazu Yoshimura","Junji Otsuka","Atsushi Irie","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2211.01146v3.pdf","comment":"Accepted to ICCV2023. Several updates from v2 including additional\n experiments and modification of typos in Auto Gain equation"},{"id":"http://arxiv.org/abs/2212.04636v3","updated":"2023-08-28T02:51:25Z","published":"2022-12-09T02:25:20Z","title":"Ego-Body Pose Estimation via Ego-Head Pose Estimation","summary":" Estimating 3D human motion from an egocentric video sequence plays a critical\nrole in human behavior understanding and has various applications in VR/AR.\nHowever, naively learning a mapping between egocentric videos and human motions\nis challenging, because the user's body is often unobserved by the front-facing\ncamera placed on the head of the user. In addition, collecting large-scale,\nhigh-quality datasets with paired egocentric videos and 3D human motions\nrequires accurate motion capture devices, which often limit the variety of\nscenes in the videos to lab-like environments. To eliminate the need for paired\negocentric video and human motions, we propose a new method, Ego-Body Pose\nEstimation via Ego-Head Pose Estimation (EgoEgo), which decomposes the problem\ninto two stages, connected by the head motion as an intermediate\nrepresentation. EgoEgo first integrates SLAM and a learning approach to\nestimate accurate head motion. Subsequently, leveraging the estimated head pose\nas input, EgoEgo utilizes conditional diffusion to generate multiple plausible\nfull-body motions. This disentanglement of head and body pose eliminates the\nneed for training datasets with paired egocentric videos and 3D human motion,\nenabling us to leverage large-scale egocentric video datasets and motion\ncapture datasets separately. Moreover, for systematic benchmarking, we develop\na synthetic dataset, AMASS-Replica-Ego-Syn (ARES), with paired egocentric\nvideos and human motion. 
On both ARES and real data, our EgoEgo model performs\nsignificantly better than the current state-of-the-art methods.\n","authors":["Jiaman Li","C. Karen Liu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2212.04636v3.pdf","comment":"CVPR 2023 (Award Candidate)"},{"id":"http://arxiv.org/abs/2308.14267v1","updated":"2023-08-28T02:49:07Z","published":"2023-08-28T02:49:07Z","title":"Unleash Model Potential: Bootstrapped Meta Self-supervised Learning","summary":" The long-term goal of machine learning is to learn general visual\nrepresentations from a small amount of data without supervision, mimicking\nthree advantages of human cognition: i) no need for labels, ii) robustness to\ndata scarcity, and iii) learning from experience. Self-supervised learning and\nmeta-learning are two promising techniques to achieve this goal, but they both\nonly partially capture the advantages and fail to address all the problems.\nSelf-supervised learning struggles to overcome the drawbacks of data scarcity,\nwhile ignoring prior knowledge that can facilitate learning and generalization.\nMeta-learning relies on supervised information and suffers from a bottleneck of\ninsufficient learning. To address these issues, we propose a novel Bootstrapped\nMeta Self-Supervised Learning (BMSSL) framework that aims to simulate the human\nlearning process. We first analyze the close relationship between meta-learning\nand self-supervised learning. Based on this insight, we reconstruct tasks to\nleverage the strengths of both paradigms, achieving advantages i and ii.\nMoreover, we employ a bi-level optimization framework that alternates between\nsolving specific tasks with a learned ability (first level) and improving this\nability (second level), attaining advantage iii. To fully harness its power, we\nintroduce a bootstrapped target based on meta-gradient to make the model its\nown teacher. We validate the effectiveness of our approach with comprehensive\ntheoretical and empirical study.\n","authors":["Jingyao Wang","Zeen Song","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.14267v1.pdf","comment":"submitted to NIPS"},{"id":"http://arxiv.org/abs/2304.12685v2","updated":"2023-08-28T02:23:05Z","published":"2023-04-25T09:39:30Z","title":"Exploring the Mutual Influence between Self-Supervised Single-Frame and\n Multi-Frame Depth Estimation","summary":" Although both self-supervised single-frame and multi-frame depth estimation\nmethods only require unlabeled monocular videos for training, the information\nthey leverage varies because single-frame methods mainly rely on\nappearance-based features while multi-frame methods focus on geometric cues.\nConsidering the complementary information of single-frame and multi-frame\nmethods, some works attempt to leverage single-frame depth to improve\nmulti-frame depth. However, these methods can neither exploit the difference\nbetween single-frame depth and multi-frame depth to improve multi-frame depth\nnor leverage multi-frame depth to optimize single-frame depth models. To fully\nutilize the mutual influence between single-frame and multi-frame methods, we\npropose a novel self-supervised training framework. Specifically, we first\nintroduce a pixel-wise adaptive depth sampling module guided by single-frame\ndepth to train the multi-frame model. Then, we leverage the minimum\nreprojection based distillation loss to transfer the knowledge from the\nmulti-frame depth network to the single-frame network to improve single-frame\ndepth. 
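One way to picture the multi-frame-to-single-frame transfer described above is as a per-pixel gate: the multi-frame depth supervises the single-frame network only where the multi-frame estimate has the lower reprojection error. The sketch below is a simplified reading with hypothetical arrays, not the authors' loss:

```python
# Gate a log-depth distillation term by comparing photometric errors.
import numpy as np

def distill_loss(d_single, d_multi, err_single, err_multi):
    """d_*: (H, W) positive depths; err_*: (H, W) reprojection errors."""
    mask = err_multi < err_single          # trust the multi-frame depth here
    diff = np.abs(np.log(d_single) - np.log(d_multi))
    return (diff * mask).sum() / max(mask.sum(), 1)

rng = np.random.default_rng(0)
loss = distill_loss(rng.uniform(1, 80, (64, 64)), rng.uniform(1, 80, (64, 64)),
                    rng.random((64, 64)), rng.random((64, 64)))
print(loss)
```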
Finally, we regard the improved single-frame depth as a prior to further\nboost the performance of multi-frame depth estimation. Experimental results on\nthe KITTI and Cityscapes datasets show that our method outperforms existing\napproaches in the self-supervised monocular setting.\n","authors":["Jie Xiang","Yun Wang","Lifeng An","Haiyang Liu","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2304.12685v2.pdf","comment":"Accepted for publication in the IEEE Robotics and Automation Letters\n (RA-L). 8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.14256v1","updated":"2023-08-28T02:20:44Z","published":"2023-08-28T02:20:44Z","title":"FaceChain: A Playground for Identity-Preserving Portrait Generation","summary":" Recent advancements in personalized image generation have unveiled the\nintriguing capability of pre-trained text-to-image models to learn identity\ninformation from a collection of portrait images. However, existing solutions\ncan struggle to produce truthful details, and usually suffer from\nseveral defects such as (i) the generated face exhibits its own unique\ncharacteristics, \ie facial shape and facial feature positioning may not\nresemble key characteristics of the input, and (ii) the synthesized face may\ncontain warped, blurred or corrupted regions. In this paper, we present\nFaceChain, a personalized portrait generation framework that combines a series\nof customized image-generation models and a rich set of face-related perceptual\nunderstanding models (\eg, face detection, deep face embedding extraction, and\nfacial attribute recognition), to tackle the aforementioned challenges and to\ngenerate truthful personalized portraits, with only a handful of portrait\nimages as input. Concretely, we inject several SOTA face models into the\ngeneration procedure, achieving a more efficient label-tagging,\ndata-processing, and model post-processing compared to previous solutions, such\nas DreamBooth~\cite{ruiz2023dreambooth}, InstantBooth~\cite{shi2023instantbooth},\nor other LoRA-only approaches~\cite{hu2021lora}. Through the development of\nFaceChain, we have identified several potential\ndirections to accelerate the development of Face/Human-Centric AIGC research and\napplication. We have designed FaceChain as a framework composed of pluggable\ncomponents that can be easily adjusted to accommodate different styles and\npersonalized needs. We hope it can grow to serve the burgeoning needs of the\ncommunities. FaceChain is open-sourced under Apache-2.0 license at\n\url{https://github.com/modelscope/facechain}.\n","authors":["Yang Liu","Cheng Yu","Lei Shang","Ziheng Wu","Xingjun Wang","Yuze Zhao","Lin Zhu","Chen Cheng","Weitao Chen","Chao Xu","Haoyu Xie","Yuan Yao","Wenmeng Zhou","Yingda Chen","Xuansong Xie","Baigui Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14256v1.pdf","comment":"This is an ongoing work that will be consistently refined and\n improved upon"},{"id":"http://arxiv.org/abs/2308.14244v1","updated":"2023-08-28T01:19:33Z","published":"2023-08-28T01:19:33Z","title":"HoloFusion: Towards Photo-realistic 3D Generative Modeling","summary":" Diffusion-based image generators can now produce high-quality and diverse\nsamples, but their success has yet to fully translate to 3D generation:\nexisting diffusion methods can either generate low-resolution but 3D consistent\noutputs, or detailed 2D views of 3D objects but with potential structural\ndefects and lacking view consistency or realism.
We present HoloFusion, a\nmethod that combines the best of these approaches to produce high-fidelity,\nplausible, and diverse 3D samples while learning from a collection of\nmulti-view 2D images only. The method first generates coarse 3D samples using a\nvariant of the recently proposed HoloDiffusion generator. Then, it\nindependently renders and upsamples a large number of views of the coarse 3D\nmodel, super-resolves them to add detail, and distills those into a single,\nhigh-fidelity implicit 3D representation, which also ensures view consistency\nof the final renders. The super-resolution network is trained as an integral\npart of HoloFusion, end-to-end, and the final distillation uses a new sampling\nscheme to capture the space of super-resolved signals. We compare our method\nagainst existing baselines, including DreamFusion, Get3D, EG3D, and\nHoloDiffusion, and achieve, to the best of our knowledge, the most realistic\nresults on the challenging CO3Dv2 dataset.\n","authors":["Animesh Karnewar","Niloy J. Mitra","Andrea Vedaldi","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2308.14244v1.pdf","comment":"ICCV 2023 conference; project page at:\n https://holodiffusion.github.io/holofusion"},{"id":"http://arxiv.org/abs/2305.19867v2","updated":"2023-08-28T23:47:07Z","published":"2023-05-31T14:04:11Z","title":"Unsupervised Anomaly Detection in Medical Images Using Masked Diffusion\n Model","summary":" It can be challenging to identify brain MRI anomalies using supervised\ndeep-learning techniques due to anatomical heterogeneity and the requirement\nfor pixel-level labeling. Unsupervised anomaly detection approaches provide an\nalternative solution by relying only on sample-level labels of healthy brains\nto generate a desired representation to identify abnormalities at the pixel\nlevel. Although generative models are crucial for generating such anatomically\nconsistent representations of healthy brains, accurately generating the\nintricate anatomy of the human brain remains a challenge. In this study, we\npresent a method called masked-DDPM (mDDPM), which introduces masking-based\nregularization to reframe the generation task of diffusion models.\nSpecifically, we introduce Masked Image Modeling (MIM) and Masked Frequency\nModeling (MFM) in our self-supervised approach that enables models to learn\nvisual representations from unlabeled data. To the best of our knowledge, this\nis the first attempt to apply MFM in DDPM models for medical applications. We\nevaluate our approach on datasets containing tumors and multiple sclerosis\nlesions and demonstrate the superior performance of our unsupervised method as\ncompared to the existing fully/weakly supervised baselines. Code is available\nat https://github.com/hasan1292/mDDPM.\n","authors":["Hasan Iqbal","Umar Khalid","Jing Hua","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19867v2.pdf","comment":"Accepted in MICCAI 2023 Workshops"},{"id":"http://arxiv.org/abs/2308.14938v1","updated":"2023-08-28T23:33:07Z","published":"2023-08-28T23:33:07Z","title":"Entropy-based Guidance of Deep Neural Networks for Accelerated\n Convergence and Improved Performance","summary":" Neural networks have dramatically increased our capacity to learn from large,\nhigh-dimensional datasets across innumerable disciplines. However, their\ndecisions are not easily interpretable, their computational costs are high, and\nbuilding and training them are uncertain processes.
To add structure to these\nefforts, we derive new mathematical results to efficiently measure the changes\nin entropy as fully-connected and convolutional neural networks process data,\nand introduce entropy-based loss terms. Experiments in image compression and\nimage classification on benchmark datasets demonstrate these losses guide\nneural networks to learn rich latent data representations in fewer dimensions,\nconverge in fewer training epochs, and achieve better test metrics.\n","authors":["Mackenzie J. Meni","Ryan T. White","Michael Mayo","Kevin Pilkiewicz"],"pdf_url":"https://arxiv.org/pdf/2308.14938v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14936v1","updated":"2023-08-28T23:23:53Z","published":"2023-08-28T23:23:53Z","title":"Auto-Prompting SAM for Mobile Friendly 3D Medical Image Segmentation","summary":" The Segment Anything Model (SAM) has rapidly been adopted for segmenting a\nwide range of natural images. However, recent studies have indicated that SAM\nexhibits subpar performance on 3D medical image segmentation tasks. In addition\nto the domain gaps between natural and medical images, disparities in the\nspatial arrangement between 2D and 3D images, the substantial computational\nburden imposed by powerful GPU servers, and the time-consuming manual prompt\ngeneration impede the extension of SAM to a broader spectrum of medical image\nsegmentation applications. To address these challenges, in this work, we\nintroduce a novel method, AutoSAM Adapter, designed specifically for 3D\nmulti-organ CT-based segmentation. We employ parameter-efficient adaptation\ntechniques in developing an automatic prompt learning paradigm to facilitate\nthe transformation of the SAM model's capabilities to 3D medical image\nsegmentation, eliminating the need for manually generated prompts. Furthermore,\nwe effectively transfer the acquired knowledge of the AutoSAM Adapter to other\nlightweight models specifically tailored for 3D medical image analysis,\nachieving state-of-the-art (SOTA) performance on medical image segmentation\ntasks. Through extensive experimental evaluation, we demonstrate the AutoSAM\nAdapter as a critical foundation for effectively leveraging the emerging\nability of foundation models in 2D natural image segmentation for 3D medical\nimage segmentation.\n","authors":["Chengyin Li","Prashant Khanduri","Yao Qiang","Rafi Ibn Sultan","Indrin Chetty","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14936v1.pdf","comment":"9 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.14930v1","updated":"2023-08-28T23:08:32Z","published":"2023-08-28T23:08:32Z","title":"Application of Quantum Pre-Processing Filter for Binary Image\n Classification with Small Samples","summary":" Over the past few years, there has been significant interest in Quantum\nMachine Learning (QML) among researchers, as it has the potential to transform\nthe field of machine learning. Several models that exploit the properties of\nquantum mechanics have been developed for practical applications. In this\nstudy, we investigated the application of our previously proposed quantum\npre-processing filter (QPF) to binary image classification. We evaluated the\nQPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits\nand alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic\nsign images). 
Similar to our previous multi-class classification results, the\napplication of QPF improved the binary image classification accuracy using\nneural network against MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8%\nto 98.3%, and 71.2% to 76.1%, respectively, but degraded it against GTSRB from\n93.5% to 92.0%. We then applied QPF in cases using a smaller number of training\nand testing samples, i.e. 80 and 20 samples per class, respectively. In order\nto derive statistically stable results, we conducted the experiment with 100\ntrials choosing randomly different training and testing samples and averaging\nthe results. The result showed that the application of QPF did not improve the\nimage classification accuracy against MNIST and EMNIST but improved it against\nCIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively.\nFurther research will be conducted as part of future work to investigate the\npotential of QPF to assess the scalability of the proposed approach to larger\nand complex datasets.\n","authors":["Farina Riaz","Shahab Abdulla","Hajime Suzuki","Srinjoy Ganguly","Ravinesh C. Deo","Susan Hopkins"],"pdf_url":"https://arxiv.org/pdf/2308.14930v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2304.11751v2","updated":"2023-08-28T22:38:31Z","published":"2023-04-23T21:05:59Z","title":"Score-Based Diffusion Models as Principled Priors for Inverse Imaging","summary":" Priors are essential for reconstructing images from noisy and/or incomplete\nmeasurements. The choice of the prior determines both the quality and\nuncertainty of recovered images. We propose turning score-based diffusion\nmodels into principled image priors (\"score-based priors\") for analyzing a\nposterior of images given measurements. Previously, probabilistic priors were\nlimited to handcrafted regularizers and simple distributions. In this work, we\nempirically validate the theoretically-proven probability function of a\nscore-based diffusion model. We show how to sample from resulting posteriors by\nusing this probability function for variational inference. Our results,\nincluding experiments on denoising, deblurring, and interferometric imaging,\nsuggest that score-based priors enable principled inference with a\nsophisticated, data-driven image prior.\n","authors":["Berthy T. Feng","Jamie Smith","Michael Rubinstein","Huiwen Chang","Katherine L. Bouman","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2304.11751v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14922v1","updated":"2023-08-28T22:32:15Z","published":"2023-08-28T22:32:15Z","title":"Automated Conversion of Music Videos into Lyric Videos","summary":" Musicians and fans often produce lyric videos, a form of music videos that\nshowcase the song's lyrics, for their favorite songs. However, making such\nvideos can be challenging and time-consuming as the lyrics need to be added in\nsynchrony and visual harmony with the video. Informed by prior work and close\nexamination of existing lyric videos, we propose a set of design guidelines to\nhelp creators make such videos. Our guidelines ensure the readability of the\nlyric text while maintaining a unified focus of attention. We instantiate these\nguidelines in a fully automated pipeline that converts an input music video\ninto a lyric video. We demonstrate the robustness of our pipeline by generating\nlyric videos from a diverse range of input sources. 
A user study shows that\nlyric videos generated by our pipeline are effective in maintaining text\nreadability and unifying the focus of attention.\n","authors":["Jiaju Ma","Anyi Rao","Li-Yi Wei","Rubaiat Habib Kazi","Hijung Valentina Shin","Maneesh Agrawala"],"pdf_url":"https://arxiv.org/pdf/2308.14922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03407v2","updated":"2023-08-28T22:25:15Z","published":"2023-08-07T08:48:46Z","title":"Spatially Varying Nanophotonic Neural Networks","summary":" The explosive growth of computation and energy cost of artificial\nintelligence has spurred strong interest in new computing modalities as\npotential alternatives to conventional electronic processors. Photonic\nprocessors, which execute operations using photons instead of electrons, have\npromised to enable optical neural networks with ultra-low latency and power\nconsumption. However, existing optical neural networks, limited by the\nunderlying network designs, have achieved image recognition accuracy much lower\nthan state-of-the-art electronic neural networks. In this work, we close this\ngap by introducing a large-kernel spatially-varying convolutional neural\nnetwork learned via low-dimensional reparameterization techniques. We\nexperimentally instantiate the network with a flat meta-optical system that\nencompasses an array of nanophotonic structures designed to induce\nangle-dependent responses. Combined with an extremely lightweight electronic\nbackend with approximately 2K parameters, we demonstrate that a nanophotonic\nneural network reaches 73.80\% blind-test classification accuracy on the\nCIFAR-10 dataset and, for the first time, an optical neural network outperforms\nthe first modern digital neural network -- AlexNet (72.64\%) with 57M\nparameters -- bringing optical neural networks into the modern deep learning\nera.\n","authors":["Kaixuan Wei","Xiao Li","Johannes Froech","Praneeth Chakravarthula","James Whitehead","Ethan Tseng","Arka Majumdar","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2308.03407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14904v1","updated":"2023-08-28T21:13:04Z","published":"2023-08-28T21:13:04Z","title":"Maturity-Aware Active Learning for Semantic Segmentation with\n Hierarchically-Adaptive Sample Assessment","summary":" Active Learning (AL) for semantic segmentation is challenging due to heavy\nclass imbalance and different ways of defining \"sample\" (pixels, areas, etc.),\nleaving the interpretation of the data distribution ambiguous. We propose\n\"Maturity-Aware Distribution Breakdown-based Active Learning\" (MADBAL), an AL\nmethod that benefits from a hierarchical approach to define a multiview data\ndistribution, which takes into account the different \"sample\" definitions\njointly, and is hence able to select the most impactful segmentation pixels with\ncomprehensive understanding. MADBAL also features a novel uncertainty\nformulation, where AL supporting modules are included to sense the features'\nmaturity, whose weighted influence continuously contributes to the uncertainty\ndetection. In this way, MADBAL makes significant performance leaps even in the\nearly AL stage, hence reducing the training burden significantly.
It\noutperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as\nverified in our extensive experiments.\n","authors":["Amirsaeed Yazdani","Xuelu Li","Vishal Monga"],"pdf_url":"https://arxiv.org/pdf/2308.14904v1.pdf","comment":"Accepted to the 34th British Machine Vision Conference (BMVC 2023)"},{"id":"http://arxiv.org/abs/2308.14900v1","updated":"2023-08-28T20:59:15Z","published":"2023-08-28T20:59:15Z","title":"BIT: Bi-Level Temporal Modeling for Efficient Supervised Action\n Segmentation","summary":" We address the task of supervised action segmentation which aims to partition\na video into non-overlapping segments, each representing a different action.\nRecent works apply transformers to perform temporal modeling at the\nframe-level, which suffer from high computational cost and cannot well capture\naction dependencies over long temporal horizons. To address these issues, we\npropose an efficient BI-level Temporal modeling (BIT) framework that learns\nexplicit action tokens to represent action segments, in parallel performs\ntemporal modeling on frame and action levels, while maintaining a low\ncomputational cost. Our model contains (i) a frame branch that uses convolution\nto learn frame-level relationships, (ii) an action branch that uses transformer\nto learn action-level dependencies with a small set of action tokens and (iii)\ncross-attentions to allow communication between the two branches. We apply and\nextend a set-prediction objective to allow each action token to represent one\nor multiple action segments, thus can avoid learning a large number of tokens\nover long videos with many segments. Thanks to the design of our action branch,\nwe can also seamlessly leverage textual transcripts of videos (when available)\nto help action segmentation by using them to initialize the action tokens. We\nevaluate our model on four video datasets (two egocentric and two third-person)\nfor action segmentation with and without transcripts, showing that BIT\nsignificantly improves the state-of-the-art accuracy with much lower\ncomputational cost (30 times faster) compared to existing transformer-based\nmethods.\n","authors":["Zijia Lu","Ehsan Elhamifar"],"pdf_url":"https://arxiv.org/pdf/2308.14900v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.14899v1","updated":"2023-08-28T20:52:18Z","published":"2023-08-28T20:52:18Z","title":"RobustCLEVR: A Benchmark and Framework for Evaluating Robustness in\n Object-centric Learning","summary":" Object-centric representation learning offers the potential to overcome\nlimitations of image-level representations by explicitly parsing image scenes\ninto their constituent components. While image-level representations typically\nlack robustness to natural image corruptions, the robustness of object-centric\nmethods remains largely untested. To address this gap, we present the\nRobustCLEVR benchmark dataset and evaluation framework. Our framework takes a\nnovel approach to evaluating robustness by enabling the specification of causal\ndependencies in the image generation process grounded in expert knowledge and\ncapable of producing a wide range of image corruptions unattainable in existing\nrobustness evaluations. Using our framework, we define several causal models of\nthe image corruption process which explicitly encode assumptions about the\ncausal relationships and distributions of each corruption type. We generate\ndataset variants for each causal model on which we evaluate state-of-the-art\nobject-centric methods. 
Overall, we find that object-centric methods are not\ninherently robust to image corruptions. Our causal evaluation approach exposes\nmodel sensitivities not observed using conventional evaluation processes,\nyielding greater insight into robustness differences across algorithms. Lastly,\nwhile conventional robustness evaluations view corruptions as\nout-of-distribution, we use our causal framework to show that even training on\nin-distribution image corruptions does not guarantee increased model\nrobustness. This work provides a step towards a more concrete and substantiated\nunderstanding of model performance and deterioration under the complex\ncorruption processes of the real world.\n","authors":["Nathan Drenkow","Mathias Unberath"],"pdf_url":"https://arxiv.org/pdf/2308.14899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14893v1","updated":"2023-08-28T20:30:10Z","published":"2023-08-28T20:30:10Z","title":"When hard negative sampling meets supervised contrastive learning","summary":" State-of-the-art image models predominantly follow a two-stage strategy:\npre-training on large datasets and fine-tuning with cross-entropy loss. Many\nstudies have shown that using cross-entropy can result in sub-optimal\ngeneralisation and stability. While the supervised contrastive loss addresses\nsome limitations of cross-entropy loss by focusing on intra-class similarities\nand inter-class differences, it neglects the importance of hard negative\nmining. We propose that models benefit from weighting negative samples based on\ntheir dissimilarity to positive counterparts. In this paper, we introduce a new\nsupervised contrastive learning\nobjective, SCHaNe, which incorporates hard negative sampling during the\nfine-tuning phase. Without requiring specialized architectures, additional\ndata, or extra computational resources, experimental results indicate that\nSCHaNe outperforms the strong baseline BEiT-3 in Top-1 accuracy across various\nbenchmarks, with significant gains of up to $3.32\%$ in few-shot learning\nsettings and $3.41\%$ in full dataset fine-tuning. Importantly, our proposed\nobjective sets a new state-of-the-art for base models on ImageNet-1k, achieving\nan 86.14\% accuracy. Furthermore, we demonstrate that the proposed objective\nyields better embeddings and explains the improved effectiveness observed in\nour experiments.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa","Zaiqiao Meng"],"pdf_url":"https://arxiv.org/pdf/2308.14893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16890v2","updated":"2023-08-28T19:38:32Z","published":"2023-06-29T12:22:47Z","title":"Trajectory Poisson multi-Bernoulli mixture filter for traffic monitoring\n using a drone","summary":" This paper proposes a multi-object tracking (MOT) algorithm for traffic\nmonitoring using a drone equipped with optical and thermal cameras. Object\ndetections on the images are obtained using a neural network for each type of\ncamera. The cameras are modelled as direction-of-arrival (DOA) sensors. Each\nDOA detection follows a von-Mises Fisher distribution, whose mean direction is\nobtained by projecting a vehicle position on the ground to the camera. We then\nuse the trajectory Poisson multi-Bernoulli mixture filter (TPMBM), which is a\nBayesian MOT algorithm, to optimally estimate the set of vehicle trajectories.\nWe have also developed a parameter estimation algorithm for the measurement\nmodel. 
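The hard-negative weighting that the SCHaNe abstract describes can be made concrete with a small sketch (our illustration under stated assumptions, not the paper's code): a supervised contrastive loss whose negative terms are importance-weighted by their similarity to the anchor, so harder negatives contribute more to the denominator.

import torch
import torch.nn.functional as F

def supcon_hard_negative_loss(z, labels, tau=0.1):
    # z: (N, d) embeddings; labels: (N,) integer class ids.
    z = F.normalize(z, dim=1)
    sim = (z @ z.t()) / tau
    n = z.size(0)
    self_mask = torch.eye(n, dtype=torch.bool, device=z.device)
    pos = labels.view(1, -1).eq(labels.view(-1, 1)) & ~self_mask
    neg = ~pos & ~self_mask
    exp_sim = sim.exp()
    # Importance weights for negatives: proportional to exp(similarity),
    # normalized so the average weight over negatives is 1 per anchor.
    neg_exp = exp_sim * neg
    w = neg_exp * neg.sum(1, keepdim=True) / neg_exp.sum(1, keepdim=True).clamp_min(1e-12)
    denom = (exp_sim * pos).sum(1) + (w * exp_sim).sum(1)
    log_prob = sim - denom.clamp_min(1e-12).log().unsqueeze(1)
    loss = -(log_prob * pos).sum(1) / pos.sum(1).clamp_min(1)
    return loss.mean()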
We have tested the accuracy of the resulting TPMBM filter in synthetic\nand experimental data sets.\n","authors":["Ángel F. García-Fernández","Jimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2306.16890v2.pdf","comment":"accepted in IEEE Transactions on Vehicular Technology"},{"id":"http://arxiv.org/abs/2308.14861v1","updated":"2023-08-28T19:31:53Z","published":"2023-08-28T19:31:53Z","title":"Evaluation of Key Spatiotemporal Learners for Print Track Anomaly\n Classification Using Melt Pool Image Streams","summary":" Recent applications of machine learning in metal additive manufacturing (MAM)\nhave demonstrated significant potential in addressing critical barriers to the\nwidespread adoption of MAM technology. Recent research in this field emphasizes\nthe importance of utilizing melt pool signatures for real-time defect\nprediction. While high-quality melt pool image data holds the promise of\nenabling precise predictions, there has been limited exploration into the\nutilization of cutting-edge spatiotemporal models that can harness the inherent\ntransient and sequential characteristics of the additive manufacturing process.\nThis research introduces and puts into practice some of the leading deep\nspatiotemporal learning models that can be adapted for the classification of\nmelt pool image streams originating from various materials, systems, and\napplications. Specifically, it investigates two-stream networks comprising\nspatial and temporal streams, a recurrent spatial network, and a factorized 3D\nconvolutional neural network. The capacity of these models to generalize when\nexposed to perturbations in melt pool image data is examined using data\nperturbation techniques grounded in real-world process scenarios. The\nimplemented architectures demonstrate the ability to capture the spatiotemporal\nfeatures of melt pool image sequences. However, among these models, only the\nKinetics400 pre-trained SlowFast network, categorized as a two-stream network,\nexhibits robust generalization capabilities in the presence of data\nperturbations.\n","authors":["Lynn Cherif","Mutahar Safdar","Guy Lamouche","Priti Wanjara","Padma Paul","Gentry Wood","Max Zimmermann","Florian Hannesen","Yaoyao Fiona Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14861v1.pdf","comment":"This work has been accepted to IFAC for publication under a Creative\n Commons Licence CC-BY-NC-ND"},{"id":"http://arxiv.org/abs/2308.14852v1","updated":"2023-08-28T19:15:27Z","published":"2023-08-28T19:15:27Z","title":"SynthDistill: Face Recognition with Knowledge Distillation from\n Synthetic Data","summary":" State-of-the-art face recognition networks are often computationally\nexpensive and cannot be used for mobile applications. Training lightweight face\nrecognition models also requires large identity-labeled datasets. Meanwhile,\nthere are privacy and ethical concerns with collecting and using large face\nrecognition datasets. While generating synthetic datasets for training face\nrecognition models is an alternative option, it is challenging to generate\nsynthetic data with sufficient intra-class variations. In addition, there is\nstill a considerable gap between the performance of models trained on real and\nsynthetic data. In this paper, we propose a new framework (named SynthDistill)\nto train lightweight face recognition models by distilling the knowledge of a\npretrained teacher face recognition model using synthetic data. 
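The projection step of the TPMBM measurement model above is easy to sketch (illustrative coordinate and unit assumptions, not the authors' code): the von Mises-Fisher mean direction is the unit vector from the camera to the vehicle's position on the ground plane.

import numpy as np

def doa_mean_direction(camera_pos, vehicle_xy):
    # Unit vector from the camera to a vehicle on the ground plane (z = 0).
    target = np.array([vehicle_xy[0], vehicle_xy[1], 0.0])
    d = target - np.asarray(camera_pos, dtype=float)
    return d / np.linalg.norm(d)

def vmf_logpdf(x, mu, kappa):
    # log-density of a 3-D von Mises-Fisher distribution on the unit sphere.
    log_c3 = np.log(kappa) - np.log(2.0 * np.pi) - np.log(np.exp(kappa) - np.exp(-kappa))
    return log_c3 + kappa * float(mu @ x)

mu = doa_mean_direction(camera_pos=[0.0, 0.0, 30.0], vehicle_xy=[12.0, 5.0])
print(vmf_logpdf(mu, mu, kappa=200.0))  # density peaks at the mean direction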
We use a\npretrained face generator network to generate synthetic face images and use the\nsynthesized images to learn a lightweight student network. We use synthetic\nface images without identity labels, mitigating the problem of generating\nsufficient intra-class variation in synthetic datasets. Instead, we propose a\nnovel dynamic\nsampling strategy from the intermediate latent space of the face generator\nnetwork to include new variations of the challenging images while further\nexploring new face images in the training batch. The results on five different\nface recognition datasets demonstrate the superiority of our lightweight model\ncompared to models trained on previous synthetic datasets, achieving a\nverification accuracy of 99.52% on the LFW dataset with a lightweight network.\nThe results also show that our proposed framework significantly reduces the gap\nbetween training with real and synthetic data. The source code for replicating\nthe experiments is publicly released.\n","authors":["Hatef Otroshi Shahreza","Anjith George","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2308.14852v1.pdf","comment":"Accepted in the IEEE International Joint Conference on Biometrics\n (IJCB 2023)"},{"id":"http://arxiv.org/abs/2308.14847v1","updated":"2023-08-28T19:08:17Z","published":"2023-08-28T19:08:17Z","title":"NSF: Neural Surface Fields for Human Modeling from Monocular Depth","summary":" Obtaining personalized 3D animatable avatars from a monocular camera has\nseveral real world applications in gaming, virtual try-on, animation, VR/XR,\netc. However, it is very challenging to model dynamic and fine-grained\nclothing deformations from such sparse data. Existing methods for modeling 3D\nhumans from depth data have limitations in terms of computational efficiency,\nmesh coherency, and flexibility in resolution and topology. For instance,\nreconstructing shapes using implicit functions and extracting explicit meshes\nper frame is computationally expensive and cannot ensure coherent meshes across\nframes. Moreover, predicting per-vertex deformations on a pre-designed human\ntemplate with a discrete surface lacks flexibility in resolution and topology.\nTo overcome these limitations, we propose a novel method `NSF: Neural\nSurface Fields' for modeling 3D clothed humans from monocular depth. NSF\ndefines a neural field solely on the base surface, which models a continuous and\nflexible displacement field. NSF can be adapted to the base surface with\ndifferent resolutions and topologies without retraining at inference time.\nCompared to existing approaches, our method eliminates the expensive per-frame\nsurface extraction while maintaining mesh coherency, and is capable of\nreconstructing meshes with arbitrary resolution without retraining. 
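A minimal sketch of the continuous displacement-field idea in the NSF abstract above (our toy version; the pose-code input and layer sizes are assumptions): an MLP defined on points of the base surface predicts a displacement for any queried point, independent of mesh resolution.

import torch
import torch.nn as nn

class SurfaceDisplacementField(nn.Module):
    def __init__(self, pose_dim=16, hidden=128):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(3 + pose_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 3),  # displacement vector in R^3
        )

    def forward(self, surface_pts, pose_code):
        # surface_pts: (N, 3) points on the base surface; pose_code: (N, pose_dim).
        # The field is continuous, so it can be queried at any surface resolution.
        return surface_pts + self.mlp(torch.cat([surface_pts, pose_code], dim=1))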
To foster\nresearch in this direction, we release our code on the project page at:\nhttps://yuxuan-xue.com/nsf.\n","authors":["Yuxuan Xue","Bharat Lal Bhatnagar","Riccardo Marin","Nikolaos Sarafianos","Yuanlu Xu","Gerard Pons-Moll","Tony Tung"],"pdf_url":"https://arxiv.org/pdf/2308.14847v1.pdf","comment":"Accepted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf"},{"id":"http://arxiv.org/abs/2308.14833v1","updated":"2023-08-28T18:43:33Z","published":"2023-08-28T18:43:33Z","title":"The Interstate-24 3D Dataset: a new benchmark for 3D multi-camera\n vehicle tracking","summary":" This work presents a novel video dataset recorded from overlapping highway\ntraffic cameras along an urban interstate, enabling multi-camera 3D object\ntracking in a traffic monitoring context. Data is released from 3 scenes\ncontaining video from at least 16 cameras each, totaling 57 minutes in length.\n877,000 3D bounding boxes and corresponding object tracklets are fully and\naccurately annotated for each camera field of view and are combined into a\nspatially and temporally continuous set of vehicle trajectories for each scene.\nLastly, existing algorithms are combined to benchmark a number of 3D\nmulti-camera tracking pipelines on the dataset, with results indicating that\nthe dataset is challenging due to the difficulty of matching objects traveling\nat high speeds across cameras and heavy object occlusion, potentially for\nhundreds of frames, during congested traffic. This work aims to enable the\ndevelopment of accurate and automatic vehicle trajectory extraction algorithms,\nwhich will play a vital role in understanding the impacts of autonomous vehicle\ntechnologies on the safety and efficiency of traffic.\n","authors":["Derek Gloudemans","Yanbing Wang","Gracie Gumm","William Barbour","Daniel B. Work"],"pdf_url":"https://arxiv.org/pdf/2308.14833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14831v1","updated":"2023-08-28T18:31:09Z","published":"2023-08-28T18:31:09Z","title":"Continual Learning with Dynamic Sparse Training: Exploring Algorithms\n for Effective Model Updates","summary":" Continual learning (CL) refers to the ability of an intelligent system to\nsequentially acquire and retain knowledge from a stream of data with as little\ncomputational overhead as possible. To this end, regularization, replay,\narchitecture, and parameter isolation approaches have been introduced in the\nliterature. Parameter isolation uses a sparse network, which makes it possible\nto allocate distinct parts of the neural network to different tasks and also to\nshare parameters between tasks if they are similar. Dynamic Sparse\nTraining (DST) is a prominent way to find these sparse networks and isolate\nthem for each task. This paper is the first empirical study investigating the\neffect of different DST components under the CL paradigm to fill a critical\nresearch gap and shed light on the optimal configuration of DST for CL if it\nexists. Therefore, we perform a comprehensive study in which we investigate\nvarious DST components to find the best topology per task on well-known\nCIFAR100 and miniImageNet benchmarks in a task-incremental CL setup since our\nprimary focus is to evaluate the performance of various DST criteria, rather\nthan the process of mask selection. We found that, at a low sparsity level,\nErdos-Renyi Kernel (ERK) initialization utilizes the backbone more efficiently\nand allows increments of tasks to be learned effectively. 
At a high sparsity level,\nhowever, uniform initialization demonstrates more reliable and robust\nperformance. In terms of growth strategy, performance depends on the chosen\ninitialization strategy and the extent of sparsity. Finally,\nadaptivity within DST components is a promising way to obtain better continual\nlearners.\n","authors":["Murat Onur Yildirim","Elif Ceren Gok Yildirim","Ghada Sokar","Decebal Constantin Mocanu","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2308.14831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14816v1","updated":"2023-08-28T18:09:13Z","published":"2023-08-28T18:09:13Z","title":"CLNeRF: Continual Learning Meets NeRF","summary":" Novel view synthesis aims to render unseen views given a set of calibrated\nimages. In practical applications, the coverage, appearance or geometry of the\nscene may change over time, with new images continuously being captured.\nEfficiently incorporating such continuous change is an open challenge. Standard\nNeRF benchmarks only involve scene coverage expansion. To study other practical\nscene changes, we propose a new dataset, World Across Time (WAT), consisting of\nscenes that change in appearance and geometry over time. We also propose a\nsimple yet effective method, CLNeRF, which introduces continual learning (CL)\nto Neural Radiance Fields (NeRFs). CLNeRF combines generative replay and the\nInstant Neural Graphics Primitives (NGP) architecture to effectively prevent\ncatastrophic forgetting and efficiently update the model when new data arrives.\nWe also add trainable appearance and geometry embeddings to NGP, allowing a\nsingle compact model to handle complex scene changes. Without the need to store\nhistorical images, CLNeRF trained sequentially over multiple scans of a\nchanging scene performs on-par with the upper bound model trained on all scans\nat once. Compared to other CL baselines, CLNeRF performs much better across\nstandard benchmarks and WAT. The source code and the WAT dataset are available\nat https://github.com/IntelLabs/CLNeRF. Video presentation is available at:\nhttps://youtu.be/nLRt6OoDGq0?si=8yD6k-8MMBJInQPs\n","authors":["Zhipeng Cai","Matthias Mueller"],"pdf_url":"https://arxiv.org/pdf/2308.14816v1.pdf","comment":"Accepted to ICCV 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.14622v1","updated":"2023-08-28T16:58:44Z","published":"2023-08-28T16:58:44Z","title":"TRIVEA: Transparent Ranking Interpretation using Visual Explanation of\n Black-Box Algorithmic Rankers","summary":" Ranking schemes drive many real-world decisions, like, where to study, whom\nto hire, what to buy, etc. Many of these decisions often come with high\nconsequences. For example, a university can be deemed less prestigious if not\nfeatured in a top-k list, and consumers might not even explore products that do\nnot get recommended to buyers. At the heart of most of these decisions are\nopaque ranking schemes, which dictate the ordering of data entities, but their\ninternal logic is inaccessible or proprietary. Drawing inferences about the\nranking differences is like a guessing game to the stakeholders, like, the\nrankees (i.e., the entities who are ranked, like product companies) and the\ndecision-makers (i.e., who use the rankings, like buyers). In this paper, we\naim to enable transparency in ranking interpretation by using algorithmic\nrankers that learn from available data and by enabling human reasoning about\nthe learned ranking differences using explainable AI (XAI) methods. 
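The ERK initialization discussed in the DST study above admits a compact sketch (hedged: this simplified version clips layer densities at 1 rather than redistributing mass the way full ERK implementations do): each layer keeps a parameter fraction proportional to the sum of its dimensions divided by their product, rescaled to hit a global target density.

import numpy as np

def erk_densities(layer_shapes, target_density=0.1):
    # layer_shapes: e.g. [(64, 3, 3, 3), (128, 64, 3, 3), (10, 128)].
    sizes = np.array([np.prod(s) for s in layer_shapes], dtype=float)
    raw = np.array([np.sum(s) / np.prod(s) for s in layer_shapes])
    # Scale so that sum(density_l * size_l) == target_density * sum(size_l).
    eps = target_density * sizes.sum() / (raw * sizes).sum()
    return np.clip(eps * raw, 0.0, 1.0)

print(erk_densities([(64, 3, 3, 3), (128, 64, 3, 3), (10, 128)]))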
To realize\nthis aim, we leverage the exploration-explanation paradigm of human-data\ninteraction to let human stakeholders explore subsets and groupings of complex\nmulti-attribute ranking data using visual explanations of model fit and\nattribute influence on rankings. We realize this explanation paradigm for\ntransparent ranking interpretation in TRIVEA, a visual analytic system that is\nfueled by: i) visualizations of model fit derived from algorithmic rankers that\nlearn the associations between attributes and rankings from available data and\nii) visual explanations derived from XAI methods that help abstract important\npatterns, like, the relative influence of attributes in different ranking\nranges. Using TRIVEA, end users not trained in data science have the agency to\ntransparently reason about the global and local behavior of the rankings\nwithout the need to open black-box ranking models and develop confidence in the\nresulting attribute-based inferences. We demonstrate the efficacy of TRIVEA\nusing multiple usage scenarios and subjective feedback from researchers with\ndiverse domain expertise. Keywords: Visual Analytics, Learning-to-Rank,\nExplainable ML, Ranking\n","authors":["Jun Yuan","Kaustav Bhattacharjee","Akm Zahirul Islam","Aritra Dasgupta"],"pdf_url":"https://arxiv.org/pdf/2308.14622v1.pdf","comment":"Accepted for publication in SpringerNature's Visual Computer Journal"},{"id":"http://arxiv.org/abs/2308.14601v1","updated":"2023-08-28T14:12:25Z","published":"2023-08-28T14:12:25Z","title":"Fairness Through Domain Awareness: Mitigating Popularity Bias For Music\n Discovery","summary":" As online music platforms grow, music recommender systems play a vital role\nin helping users navigate and discover content within their vast musical\ndatabases. At odds with this larger goal is the presence of popularity bias,\nwhich causes algorithmic systems to favor mainstream content over, potentially\nmore relevant, but niche items. In this work, we explore the intrinsic\nrelationship between music discovery and popularity bias. To mitigate this\nissue, we propose a domain-aware, individual fairness-based approach which\naddresses popularity bias in graph neural network (GNN) based recommender\nsystems. Our approach uses individual fairness to reflect a ground truth\nlistening experience, i.e., if two songs sound similar, this similarity should\nbe reflected in their representations. In doing so, we facilitate meaningful\nmusic discovery that is robust to popularity bias and grounded in the music\ndomain. We apply our BOOST methodology to two discovery-based tasks, performing\nrecommendations at both the playlist level and user level. Then, we ground our\nevaluation in the cold start setting, showing that our approach outperforms\nexisting fairness benchmarks in both performance and recommendation of\nlesser-known content. 
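The "if two songs sound similar, their representations should be similar" criterion quoted above can be written as a simple individual-fairness penalty (a sketch under our assumptions, not the BOOST code): penalize embedding distances that exceed a Lipschitz multiple of a precomputed audio-similarity distance.

import torch

def individual_fairness_penalty(emb, audio_dist, lipschitz=1.0):
    # emb: (N, d) learned song embeddings; audio_dist: (N, N) distances derived
    # from audio features (assumed given). Zero when embeddings respect similarity.
    emb_dist = torch.cdist(emb, emb)
    return torch.relu(emb_dist - lipschitz * audio_dist).mean()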
Finally, our analysis explains why our proposed\nmethodology is a novel and promising approach to mitigating popularity bias and\nimproving the discovery of new and niche content in music recommender systems.\n","authors":["Rebecca Salganik","Fernando Diaz","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2308.14601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14499v1","updated":"2023-08-28T11:19:44Z","published":"2023-08-28T11:19:44Z","title":"Efficient and Accurate Tree Detection from 3D Point Clouds through Paid\n Crowdsourcing","summary":" Accurate tree detection is of growing importance in applications such as\nurban planning, forest inventory, and environmental monitoring. In this\narticle, we present an approach to creating tree maps by annotating them in 3D\npoint clouds. Point cloud representations allow the precise identification of\ntree positions, particularly stem locations, and their heights. Our method\nleverages human computational power through paid crowdsourcing, employing a web\ntool designed to enable even non-experts to effectively tackle the task. The\nprimary focus of this paper is to discuss the web tool's development and\nstrategies to ensure high-quality tree annotations despite encountering noise\nin the crowdsourced data. Following our methodology, we achieve quality\nmeasures surpassing 90% for various challenging test sets of diverse\ncomplexities. We emphasize that our tree map creation process, including\ninitial point cloud collection, can be completed within 1-2 days.\n","authors":["Michael Kölle","Volker Walter","Ivan Shiller","Uwe Soergel"],"pdf_url":"https://arxiv.org/pdf/2308.14499v1.pdf","comment":"This paper can be considered an extension of the approach presented\n by Walter et al.\n (https://isprs-annals.copernicus.org/articles/V-4-2020/49/2020/)"},{"id":"http://arxiv.org/abs/2308.14436v1","updated":"2023-08-28T09:22:02Z","published":"2023-08-28T09:22:02Z","title":"Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware\n Pre-training for KBQA","summary":" Knowledge Base Question Answering (KBQA) aims to answer natural language\nquestions with factual information such as entities and relations in KBs.\nHowever, traditional Pre-trained Language Models (PLMs) are directly\npre-trained on large-scale natural language corpus, which poses challenges for\nthem in understanding and representing complex subgraphs in structured KBs. To\nbridge the gap between texts and structured KBs, we propose a Structured\nKnowledge-aware Pre-training method (SKP). In the pre-training stage, we\nintroduce two novel structured knowledge-aware tasks, guiding the model to\neffectively learn the implicit relationship and better representations of\ncomplex subgraphs. In downstream KBQA task, we further design an efficient\nlinearization strategy and an interval attention mechanism, which assist the\nmodel to better encode complex subgraphs and shield the interference of\nirrelevant subgraphs during reasoning respectively. Detailed experiments and\nanalyses on WebQSP verify the effectiveness of SKP, especially the significant\nimprovement in subgraph retrieval (+4.08% H@10).\n","authors":["Guanting Dong","Rumei Li","Sirui Wang","Yupeng Zhang","Yunsen Xian","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14436v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.09340v2","updated":"2023-08-28T08:00:58Z","published":"2023-08-18T06:52:07Z","title":"How Discriminative Are Your Qrels? 
How To Study the Statistical\n Significance of Document Adjudication Methods","summary":" Creating test collections for offline retrieval evaluation requires human\neffort to judge documents' relevance. This expensive activity motivated much\nwork in developing methods for constructing benchmarks with lower assessment\ncosts. In this respect, adjudication methods actively decide both which\ndocuments and the order in which experts review them, in order to better\nexploit the assessment budget or to lower it. Researchers evaluate the quality\nof those methods by measuring the correlation between the known gold ranking of\nsystems under the full collection and the observed ranking of systems under the\nlower-cost one. This traditional analysis ignores whether and how the low-cost\njudgements impact the statistically significant differences among systems\nwith respect to the full collection. We fill this void by proposing a novel\nmethodology to evaluate how the low-cost adjudication methods preserve the\npairwise significant differences between systems observed under the full\ncollection. In\nother terms, while traditional approaches look for stability in answering the\nquestion \"is system A better than system B?\", our proposed approach looks for\nstability in answering the question \"is system A significantly better than\nsystem B?\", which is the ultimate question researchers need to answer to\nguarantee the generalisability of their results. Among other results, we found\nthat the best methods in terms of ranking of systems correlation do not always\nmatch those preserving statistical significance.\n","authors":["David Otero","Javier Parapar","Nicola Ferro"],"pdf_url":"https://arxiv.org/pdf/2308.09340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14355v1","updated":"2023-08-28T07:03:08Z","published":"2023-08-28T07:03:08Z","title":"Can Transformer and GNN Help Each Other?","summary":" Although Transformer has achieved great success in natural language processing\nand computer vision, it has difficulty generalizing to medium and large-scale\ngraph data for two important reasons: (i) High complexity. (ii) Failing to\ncapture the complex and entangled structure information. In graph\nrepresentation learning, Graph Neural Networks (GNNs) can fuse the graph\nstructure and node attributes but have limited receptive fields. Therefore, we\nask whether we can combine Transformers and GNNs to help each other. In\nthis paper, we propose a new model named TransGNN where the Transformer layer\nand GNN layer are used alternately to improve each other. Specifically, to\nexpand the receptive field and disentangle the information aggregation from\nedges, we propose using Transformer to aggregate more relevant nodes'\ninformation to improve the message passing of GNNs. Besides, to capture the\ngraph structure information, we utilize positional encoding and make use of the\nGNN layer to fuse the structure into node attributes, which improves the\nTransformer in graph data. We also propose to sample the most relevant nodes\nfor Transformer and two efficient sample update strategies to lower the\ncomplexity. At last, we theoretically prove that TransGNN is more expressive\nthan GNNs with only extra linear complexity. 
The experiments on eight datasets\ncorroborate the effectiveness of TransGNN on node and graph classification\ntasks.\n","authors":["Peiyan Zhang","Yuchen Yan","Chaozhuo Li","Senzhang Wang","Xing Xie","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17426v2","updated":"2023-08-28T05:37:36Z","published":"2023-06-30T06:40:11Z","title":"Leveraging Watch-time Feedback for Short-Video Recommendations: A Causal\n Labeling Framework","summary":" With the proliferation of short video applications, the significance of short\nvideo recommendations has vastly increased. Unlike other recommendation\nscenarios, short video recommendation systems heavily rely on feedback from\nwatch time. Existing approaches simply treat watch time as a direct label,\nfailing to effectively harness its extensive semantics and introduce bias,\nthereby limiting the potential for modeling user interests based on watch time.\nTo overcome this challenge, we propose a framework named Debiased\nMultiple-semantics-extracting Labeling(DML). DML constructs labels that\nencompass various semantics by utilizing quantiles derived from the\ndistribution of watch time, prioritizing relative order rather than absolute\nlabel values. This approach facilitates easier model learning while aligning\nwith the ranking objective of recommendations. Furthermore, we introduce a\nmethod inspired by causal adjustment to refine label definitions, thereby\ndirectly mitigating bias at the label level. We substantiate the effectiveness\nof our DML framework through both online and offline experiments. Extensive\nresults demonstrate that our DML could effectively leverage watch time to\ndiscover users' real interests, enhancing their engagement in our application.\n","authors":["Yang Zhang","Yimeng Bai","Jianxin Chang","Xiaoxue Zang","Song Lu","Jing Lu","Fuli Feng","Yanan Niu","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2306.17426v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14296v1","updated":"2023-08-28T04:31:04Z","published":"2023-08-28T04:31:04Z","title":"RecMind: Large Language Model Powered Agent For Recommendation","summary":" Recent advancements in instructing Large Language Models (LLMs) to utilize\nexternal tools and execute multi-step plans have significantly enhanced their\nability to solve intricate tasks, ranging from mathematical problems to\ncreative writing. Yet, there remains a notable gap in studying the capacity of\nLLMs in responding to personalized queries such as a recommendation request. To\nbridge this gap, we have designed an LLM-powered autonomous recommender agent,\nRecMind, which is capable of providing precise personalized recommendations\nthrough careful planning, utilizing tools for obtaining external knowledge, and\nleveraging individual data. We propose a novel algorithm, Self-Inspiring, to\nimprove the planning ability of the LLM agent. At each intermediate planning\nstep, the LLM 'self-inspires' to consider all previously explored states to\nplan for next step. This mechanism greatly improves the model's ability to\ncomprehend and utilize historical planning information for recommendation. We\nevaluate RecMind's performance in various recommendation scenarios, including\nrating prediction, sequential recommendation, direct recommendation,\nexplanation generation, and review summarization. 
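The alternating design in the TransGNN abstract above suggests a block like the following sketch (hypothetical layer sizes and a dense adjacency matrix for brevity; the paper's node sampling and positional encoding are omitted): a Transformer layer widens the receptive field, then a message-passing step fuses graph structure back into the node features.

import torch
import torch.nn as nn

class TransGNNBlock(nn.Module):
    def __init__(self, dim, heads=4):
        super().__init__()
        self.attn = nn.TransformerEncoderLayer(dim, heads, dim * 4, batch_first=True)
        self.gnn_lin = nn.Linear(dim, dim)

    def forward(self, x, adj):
        # x: (N, dim) node features; adj: (N, N) normalized adjacency matrix.
        x = self.attn(x.unsqueeze(0)).squeeze(0)       # global attention step
        return x + torch.relu(adj @ self.gnn_lin(x))   # GNN message-passing step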
Our experiment shows that\nRecMind outperforms existing zero/few-shot LLM-based recommendation methods in\ndifferent recommendation tasks and achieves performance competitive with a\nrecent model, P5, which requires full pre-training for the recommendation\ntasks.\n","authors":["Yancheng Wang","Ziyan Jiang","Zheng Chen","Fan Yang","Yingxue Zhou","Eunah Cho","Xing Fan","Xiaojiang Huang","Yanbin Lu","Yingzhen Yang"],"pdf_url":"https://arxiv.org/pdf/2308.14296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14276v1","updated":"2023-08-28T03:15:37Z","published":"2023-08-28T03:15:37Z","title":"Alleviating Video-Length Effect for Micro-video Recommendation","summary":" Micro-video platforms such as TikTok are extremely popular nowadays. One\nimportant feature is that users no longer select videos of interest from a set;\ninstead, they either watch the recommended video or skip to the next one. As a\nresult, the time length of users' watching behavior becomes the most important\nsignal for identifying preferences. However, our empirical data analysis has\nshown a video-length effect: longer videos tend to receive a higher\naverage view time, so adopting such view-time labels for measuring\nuser preferences can easily induce a biased model that favors longer\nvideos. In this paper, we propose a Video Length Debiasing Recommendation\n(VLDRec) method to alleviate such an effect for micro-video recommendation.\nVLDRec designs a data labeling approach and a sample generation module that\nbetter capture user preferences in a view-time oriented manner. It further\nleverages the multi-task learning technique to jointly optimize the above\nsamples with the original biased ones. Extensive experiments show that VLDRec can\nimprove the users' view time by 1.81% and 11.32% on two real-world datasets,\ngiven a recommendation list of a fixed overall video length, compared with the\nbest baseline method. Moreover, VLDRec is also more effective in matching\nusers' interests in terms of the video content.\n","authors":["Yuhan Quan","Jingtao Ding","Chen Gao","Nian Li","Lingling Yi","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.14276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14263v1","updated":"2023-08-28T02:38:17Z","published":"2023-08-28T02:38:17Z","title":"Cross-Modal Retrieval: A Systematic Review of Methods and Future\n Directions","summary":" With the exponential surge in diverse multi-modal data, traditional uni-modal\nretrieval methods struggle to meet the needs of users demanding access to data\nfrom various modalities. To address this, cross-modal retrieval has emerged,\nenabling interaction across modalities, facilitating semantic matching, and\nleveraging complementarity and consistency between different modal data.\nAlthough prior literature undertook a review of the cross-modal retrieval\nfield, it exhibits numerous deficiencies pertaining to timeliness, taxonomy,\nand comprehensiveness. This paper conducts a comprehensive review of\ncross-modal retrieval's evolution, spanning from shallow statistical analysis\ntechniques to vision-language pre-training models. Commencing with a\ncomprehensive taxonomy grounded in machine learning paradigms, mechanisms, and\nmodels, the paper then delves deeply into the principles and architectures\nunderpinning existing cross-modal retrieval methods. Furthermore, it offers an\noverview of widely used benchmarks, metrics, and performances. 
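The relative-order labeling that the watch-time abstracts above (DML and VLDRec) build on can be sketched as follows (our hypothetical variant: bucketing by video duration before taking watch-time quantiles, so labels encode order among comparable videos rather than absolute seconds):

import numpy as np

def watch_time_quantile_labels(watch_time, duration, n_groups=10, n_levels=5):
    # Bucket videos by duration, then rank watch time within each bucket.
    edges = np.quantile(duration, np.linspace(0, 1, n_groups + 1)[1:-1])
    group = np.digitize(duration, edges)
    labels = np.zeros(watch_time.shape, dtype=int)
    for g in np.unique(group):
        idx = group == g
        q = np.quantile(watch_time[idx], np.linspace(0, 1, n_levels + 1)[1:-1])
        labels[idx] = np.digitize(watch_time[idx], q)
    return labels  # 0 .. n_levels-1, comparable across video lengths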
Lastly, the\npaper probes the prospects and challenges that confront contemporary\ncross-modal retrieval, while engaging in a discourse on potential directions\nfor further progress in the field. To facilitate the research on cross-modal\nretrieval, we develop an open-source code repository at\nhttps://github.com/BMC-SDNU/Cross-Modal-Retrieval.\n","authors":["Lei Zhu","Tianshi Wang","Fengling Li","Jingjing Li","Zheng Zhang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14916v1","updated":"2023-08-28T22:26:50Z","published":"2023-08-28T22:26:50Z","title":"RecRec: Algorithmic Recourse for Recommender Systems","summary":" Recommender systems play an essential role in the choices people make in\ndomains such as entertainment, shopping, food, news, employment, and education.\nThe machine learning models underlying these recommender systems are often\nenormously large and black-box in nature for users, content providers, and\nsystem developers alike. It is often crucial for all stakeholders to understand\nthe model's rationale behind making certain predictions and recommendations.\nThis is especially true for the content providers whose livelihoods depend on\nthe recommender system. Drawing motivation from the practitioners' need, in\nthis work, we propose a recourse framework for recommender systems, targeted\ntowards the content providers. Algorithmic recourse in the recommendation\nsetting is a set of actions that, if executed, would modify the recommendations\n(or ranking) of an item in the desired manner. A recourse suggests actions of\nthe form: \"if a feature changes X to Y, then the ranking of that item for a set\nof users will change to Z.\" Furthermore, we demonstrate that RecRec is highly\neffective in generating valid, sparse, and actionable recourses through an\nempirical evaluation of recommender systems trained on three real-world\ndatasets. To the best of our knowledge, this work is the first to conceptualize\nand empirically test a generalized framework for generating recourses for\nrecommender systems.\n","authors":["Sahil Verma","Ashudeep Singh","Varich Boonsanong","John P. Dickerson","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14916v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.14902v1","updated":"2023-08-28T21:08:06Z","published":"2023-08-28T21:08:06Z","title":"Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in\n Recommendation Networks","summary":" Recommendation models are vital in delivering personalized user experiences\nby leveraging the correlation between multiple input features. However, deep\nlearning-based recommendation models often face challenges due to evolving user\nbehaviour and item features, leading to covariate shifts. Effective\ncross-feature learning is crucial to handle data distribution drift and\nadapting to changing user behaviour. Traditional feature interaction techniques\nhave limitations in achieving optimal performance in this context.\n This work introduces Ad-Rec, an advanced network that leverages feature\ninteraction techniques to address covariate shifts. This helps eliminate\nirrelevant interactions in recommendation tasks. Ad-Rec leverages masked\ntransformers to enable the learning of higher-order cross-features while\nmitigating the impact of data distribution drift. 
Our approach improves model\nquality, accelerates convergence, and reduces training time, as measured by the\nArea Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its\nability to achieve superior model quality through comprehensive ablation\nstudies.\n","authors":["Muhammad Adnan","Yassaman Ebrahimzadeh Maboud","Divya Mahajan","Prashant J. Nair"],"pdf_url":"https://arxiv.org/pdf/2308.14902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14786v1","updated":"2023-08-28T17:07:31Z","published":"2023-08-28T17:07:31Z","title":"Extending Cross-Modal Retrieval with Interactive Learning to Improve\n Image Retrieval Performance in Forensics","summary":" Nowadays, one of the critical challenges in forensics is analyzing the\nenormous amounts of unstructured digital evidence, such as images. Often,\nunstructured digital evidence contains precious information for forensic\ninvestigations. Therefore, a retrieval system that can effectively identify\nforensically relevant images is paramount. In this work, we explored the\neffectiveness of interactive learning in improving image retrieval performance\nin the forensic domain by proposing Excalibur - a zero-shot cross-modal image\nretrieval system extended with interactive learning. Excalibur was evaluated\nusing both simulations and a user study. The simulations reveal that\ninteractive learning is highly effective in improving retrieval performance in\nthe forensic domain. Furthermore, user study participants could effectively\nleverage the power of interactive learning. Finally, they considered Excalibur\neffective and straightforward to use and expressed interest in using it in\ntheir daily practice.\n","authors":["Nils Böhne","Mark Berger","Ronald van Velzen"],"pdf_url":"https://arxiv.org/pdf/2308.14786v1.pdf","comment":"Submitted to the AAAI22 conference"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.14753v1","updated":"2023-08-28T17:59:47Z","published":"2023-08-28T17:59:47Z","title":"Efficient Discovery and Effective Evaluation of Visual Perceptual\n Similarity: A Benchmark and Beyond","summary":" Visual similarities discovery (VSD) is an important task with broad\ne-commerce applications. Given an image of a certain object, the goal of VSD is\nto retrieve images of different objects with high perceptual visual similarity.\nAlthough being a highly addressed problem, the evaluation of proposed methods\nfor VSD is often based on a proxy of an identification-retrieval task,\nevaluating the ability of a model to retrieve different images of the same\nobject. We posit that evaluating VSD methods based on identification tasks is\nlimited, and faithful evaluation must rely on expert annotations. In this\npaper, we introduce the first large-scale fashion visual similarity benchmark\ndataset, consisting of more than 110K expert-annotated image pairs. Besides\nthis major contribution, we share insight from the challenges we faced while\ncurating this dataset. Based on these insights, we propose a novel and\nefficient labeling procedure that can be applied to any dataset. Our analysis\nexamines its limitations and inductive biases, and based on these findings, we\npropose metrics to mitigate those limitations. 
Though our primary focus lies on\nvisual similarity, the methodologies we present have broader applications for\ndiscovering and evaluating perceptual similarity across various domains.\n","authors":["Oren Barkan","Tal Reiss","Jonathan Weill","Ori Katz","Roy Hirsch","Itzik Malkiel","Noam Koenigstein"],"pdf_url":"https://arxiv.org/pdf/2308.14753v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14742v1","updated":"2023-08-28T17:43:04Z","published":"2023-08-28T17:43:04Z","title":"Minimizing Quasi-Self-Concordant Functions by Gradient Regularization of\n Newton Method","summary":" We study the composite convex optimization problems with a\nQuasi-Self-Concordant smooth component. This problem class naturally\ninterpolates between classic Self-Concordant functions and functions with\nLipschitz continuous Hessian. Previously, the best complexity bounds for this\nproblem class were associated with trust-region schemes and implementations of\na ball-minimization oracle. In this paper, we show that for minimizing\nQuasi-Self-Concordant functions we can use instead the basic Newton Method with\nGradient Regularization. For unconstrained minimization, it only involves a\nsimple matrix inversion operation (solving a linear system) at each step. We\nprove a fast global linear rate for this algorithm, matching the complexity\nbound of the trust-region scheme, while our method remains especially simple to\nimplement. Then, we introduce the Dual Newton Method, and based on it, develop\nthe corresponding Accelerated Newton Scheme for this problem class, which\nfurther improves the complexity factor of the basic method. As a direct\nconsequence of our results, we establish fast global linear rates of simple\nvariants of the Newton Method applied to several practical problems, including\nLogistic Regression, Soft Maximum, and Matrix Scaling, without requiring\nadditional assumptions on strong or uniform convexity for the target objective.\n","authors":["Nikita Doikov"],"pdf_url":"https://arxiv.org/pdf/2308.14742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14740v1","updated":"2023-08-28T17:41:14Z","published":"2023-08-28T17:41:14Z","title":"Total Selfie: Generating Full-Body Selfies","summary":" We present a method to generate full-body selfies -- photos that you take of\nyourself, but capturing your whole body as if someone else took the photo of\nyou from a few feet away. Our approach takes as input a pre-captured video of\nyour body, a target pose photo, and a selfie + background pair for each\nlocation. We introduce a novel diffusion-based approach to combine all of this\ninformation into high quality, well-composed photos of you with the desired\npose and background.\n","authors":["Bowei Chen","Brian Curless","Ira Kemelmacher-Shlizerman","Steve Seitz"],"pdf_url":"https://arxiv.org/pdf/2308.14740v1.pdf","comment":"Project page:\n https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/"},{"id":"http://arxiv.org/abs/2306.12926v2","updated":"2023-08-28T17:33:56Z","published":"2023-06-22T14:38:12Z","title":"Decentralized Multi-Agent Reinforcement Learning with Global State\n Prediction","summary":" Deep reinforcement learning (DRL) has seen remarkable success in the control\nof single robots. However, applying DRL to robot swarms presents significant\nchallenges. 
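The gradient-regularized Newton step analyzed in the quasi-self-concordant paper above is simple to sketch (the schedule lam = M * ||g|| is our illustrative choice, not necessarily the paper's exact rule), with logistic regression, one of the applications the abstract names, as the test objective.

import numpy as np

def newton_grad_reg_step(x, grad, hess, M=1.0):
    # One Newton step with gradient regularization:
    # solve (H(x) + lam * I) d = -g(x), with lam tied to the gradient norm.
    g = grad(x)
    lam = M * np.linalg.norm(g)
    d = np.linalg.solve(hess(x) + lam * np.eye(x.size), -g)
    return x + d

def make_logistic(A, b):
    # Logistic-regression gradient and Hessian for labels b in {0, 1}.
    def grad(x):
        p = 1.0 / (1.0 + np.exp(-A @ x))
        return A.T @ (p - b)
    def hess(x):
        p = 1.0 / (1.0 + np.exp(-A @ x))
        return (A.T * (p * (1 - p))) @ A  # A^T diag(w) A
    return grad, hess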
A critical challenge is non-stationarity, which occurs when two or\nmore robots update individual or shared policies concurrently, thereby engaging\nin an interdependent training process with no guarantees of convergence.\nCircumventing non-stationarity typically involves training the robots with\nglobal information about other agents' states and/or actions. In contrast, in\nthis paper, we explore how to remove the need for global information. We pose\nour problem as a Partially Observable Markov Decision Process, due to the\nabsence of global knowledge on other agents. Using collective transport as a\ntestbed scenario, we study two approaches to multi-agent training. In the\nfirst, the robots exchange no messages, and are trained to rely on implicit\ncommunication through push-and-pull on the object to transport. In the second\napproach, we introduce Global State Prediction (GSP), a network trained to\nform a belief over the swarm as a whole and predict its future states. We\nprovide a comprehensive study over four well-known deep reinforcement learning\nalgorithms in environments with obstacles, measuring performance as the\nsuccessful transport of the object to the goal within a desired time-frame.\nThrough an ablation study, we show that including GSP boosts performance and\nincreases robustness when compared with methods that use global knowledge.\n","authors":["Joshua Bloom","Pranjal Paliwal","Apratim Mukherjee","Carlo Pinciroli"],"pdf_url":"https://arxiv.org/pdf/2306.12926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07143v3","updated":"2023-08-28T17:23:46Z","published":"2023-01-17T19:15:06Z","title":"Revisiting mass-radius relationships for exoplanet populations: a\n machine learning insight","summary":" The growing number of exoplanet discoveries and advances in machine learning\ntechniques have opened new avenues for exploring and understanding the\ncharacteristics of worlds beyond our Solar System. In this study, we employ\nefficient machine learning approaches to analyze a dataset comprising 762\nconfirmed exoplanets and eight Solar System planets, aiming to characterize\ntheir fundamental quantities. By applying different unsupervised clustering\nalgorithms, we classify the data into two main classes: 'small' and 'giant'\nplanets, with cut-off values at $R_{p}=8.13R_{\oplus}$ and\n$M_{p}=52.48M_{\oplus}$. This classification reveals an intriguing distinction:\ngiant planets have lower densities, suggesting higher H-He mass fractions,\nwhile small planets are denser, composed mainly of heavier elements. We apply\nvarious regression models to uncover correlations between physical parameters\nand their predictive power for exoplanet radius. Our analysis highlights that\nplanetary mass, orbital period, and stellar mass play crucial roles in\npredicting exoplanet radius. Among the models evaluated, the Support Vector\nRegression consistently outperforms others, demonstrating its promise for\nobtaining accurate planetary radius estimates. Furthermore, we derive\nparametric equations using the M5P and Markov Chain Monte Carlo methods.\nNotably, our study reveals a noteworthy result: small planets exhibit a\npositive linear mass-radius relation, aligning with previous findings.\nConversely, for giant planets, we observe a strong correlation between\nplanetary radius and the mass of their host stars, which might provide\nintriguing insights into the relationship between giant planet formation and\nstellar characteristics.\n","authors":["Mahdiyar Mousavi-Sadr","Davood M. 
Jassur","Ghassem Gozaliasl"],"pdf_url":"https://arxiv.org/pdf/2301.07143v3.pdf","comment":"Accepted for publication in MNRAS. 17 pages, 18 figures"},{"id":"http://arxiv.org/abs/2308.14711v1","updated":"2023-08-28T17:11:41Z","published":"2023-08-28T17:11:41Z","title":"Fast Feedforward Networks","summary":" We break the linear link between the layer size and its inference cost by\nintroducing the fast feedforward (FFF) architecture, a logarithmic-time\nalternative to feedforward networks.\n We show that FFFs give comparable performance to feedforward networks at an\nexponential fraction of their inference cost, are quicker to deliver\nperformance compared to mixture-of-expert networks, and can readily take the\nplace of either in transformers.\n Pushing FFFs to the absolute limit, we train a vision transformer to perform\nsingle-neuron inferences at the cost of only 5.8% performance decrease against\nthe full-width variant.\n Our implementation is available as a Python package; just use \"pip install\nfastfeedforward\".\n","authors":["Peter Belcak","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2308.14711v1.pdf","comment":"12 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.14710v1","updated":"2023-08-28T17:10:12Z","published":"2023-08-28T17:10:12Z","title":"VideoCutLER: Surprisingly Simple Unsupervised Video Instance\n Segmentation","summary":" Existing approaches to unsupervised video instance segmentation typically\nrely on motion estimates and experience difficulties tracking small or\ndivergent motions. We present VideoCutLER, a simple method for unsupervised\nmulti-instance video segmentation without using motion-based learning signals\nlike optical flow or training on natural videos. Our key insight is that using\nhigh-quality pseudo masks and a simple video synthesis method for model\ntraining is surprisingly sufficient to enable the resulting video model to\neffectively segment and track multiple instances across video frames. We show\nthe first competitive unsupervised learning results on the challenging\nYouTubeVIS-2019 benchmark, achieving 50.7% APvideo^50 , surpassing the previous\nstate-of-the-art by a large margin. VideoCutLER can also serve as a strong\npretrained model for supervised video instance segmentation tasks, exceeding\nDINO by 15.9% on YouTubeVIS-2019 in terms of APvideo.\n","authors":["Xudong Wang","Ishan Misra","Ziyun Zeng","Rohit Girdhar","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2308.14710v1.pdf","comment":"Preprint. Code: https://github.com/facebookresearch/CutLER"},{"id":"http://arxiv.org/abs/2308.14705v1","updated":"2023-08-28T16:58:44Z","published":"2023-08-28T16:58:44Z","title":"Diversified Ensemble of Independent Sub-Networks for Robust\n Self-Supervised Representation Learning","summary":" Ensembling a neural network is a widely recognized approach to enhance model\nperformance, estimate uncertainty, and improve robustness in deep supervised\nlearning. However, deep ensembles often come with high computational costs and\nmemory demands. In addition, the efficiency of a deep ensemble is related to\ndiversity among the ensemble members which is challenging for large,\nover-parameterized deep neural networks. Moreover, ensemble learning has not\nyet seen such widespread adoption, and it remains a challenging endeavor for\nself-supervised or unsupervised representation learning. 
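The logarithmic-time idea in the fast-feedforward abstract above can be illustrated with a toy routing tree (ours, not the authors' pip package): a learned router walks a binary tree of depth d and only the selected leaf MLP runs, so inference touches O(d) routers plus one leaf instead of all 2^d leaves.

import torch
import torch.nn as nn

class FastFeedforwardToy(nn.Module):
    def __init__(self, dim, depth=3):
        super().__init__()
        self.depth = depth
        self.routers = nn.ModuleList(nn.Linear(dim, 1) for _ in range(2 ** depth - 1))
        self.leaves = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
            for _ in range(2 ** depth)
        )

    def forward(self, x):
        # x: (dim,) a single token, kept 1-D for clarity.
        node = 0
        for _ in range(self.depth):  # hard routing: O(depth) decisions
            go_right = bool(self.routers[node](x) > 0)
            node = 2 * node + 1 + int(go_right)  # heap-style child index
        return self.leaves[node - (2 ** self.depth - 1)](x)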
Motivated by these\nchallenges, we present a novel self-supervised training regime that leverages\nan ensemble of independent sub-networks, complemented by a new loss function\ndesigned to encourage diversity. Our method efficiently builds a sub-model\nensemble with high diversity, leading to well-calibrated estimates of model\nuncertainty, all achieved with minimal computational overhead compared to\ntraditional deep self-supervised ensembles. To evaluate the effectiveness of\nour approach, we conducted extensive experiments across various tasks,\nincluding in-distribution generalization, out-of-distribution detection,\ndataset corruption, and semi-supervised settings. The results demonstrate that\nour method significantly improves prediction reliability. Our approach not only\nachieves excellent accuracy but also enhances calibration, surpassing baseline\nperformance across a wide range of self-supervised architectures in computer\nvision, natural language processing, and genomics data.\n","authors":["Amirhossein Vahidi","Lisa Wimmer","Hüseyin Anil Gündüz","Bernd Bischl","Eyke Hüllermeier","Mina Rezaei"],"pdf_url":"https://arxiv.org/pdf/2308.14705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00915v3","updated":"2023-08-28T16:36:31Z","published":"2023-06-01T17:18:15Z","title":"The feasibility of artificial consciousness through the lens of\n neuroscience","summary":" Interactions with large language models have led to the suggestion that these\nmodels may soon be conscious. From the perspective of neuroscience, this\nposition is difficult to defend. For one, the inputs to large language models\nlack the embodied, embedded information content characteristic of our sensory\ncontact with the world around us. Secondly, the architecture of large language\nmodels is missing key features of the thalamocortical system that have been\nlinked to conscious awareness in mammals. Finally, the evolutionary and\ndevelopmental trajectories that led to the emergence of living conscious\norganisms arguably have no parallels in artificial systems as envisioned today.\nThe existence of living organisms depends on their actions, and their survival\nis intricately linked to multi-level cellular, inter-cellular, and organismal\nprocesses culminating in agency and consciousness.\n","authors":["Jaan Aru","Matthew Larkum","James M. Shine"],"pdf_url":"https://arxiv.org/pdf/2306.00915v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14693v1","updated":"2023-08-28T16:34:50Z","published":"2023-08-28T16:34:50Z","title":"Hybrid PLS-ML Authentication Scheme for V2I Communication Networks","summary":" Vehicular communication networks are rapidly emerging as vehicles become\nsmarter. However, these networks are increasingly susceptible to various\nattacks. The situation is exacerbated by the rise in automated vehicles,\nemphasizing the need for security and authentication measures to\nensure safe and effective traffic management. In this paper, we propose a novel\nhybrid physical layer security (PLS)-machine learning (ML) authentication\nscheme by exploiting the position of the transmitter vehicle as a device\nfingerprint. We use a time-of-arrival (ToA) based localization mechanism where\nthe ToA is estimated at roadside units (RSUs), and the coordinates of the\ntransmitter vehicle are extracted at the base station (BS). Furthermore, to\ntrack the mobility of the moving legitimate vehicle, we use an ML model trained\non several system parameters. 
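The position check in the hybrid PLS-ML abstract above reduces to two small pieces, sketched here with hypothetical names and thresholds: linearized least-squares multilateration from RSU time-of-arrival measurements, and a distance test against the ML-predicted position (the binary hypothesis test).

import numpy as np

C = 3e8  # propagation speed in m/s

def toa_position(rsu_xy, toa):
    # rsu_xy: (k, 2) RSU positions; toa: (k,) time-of-arrival in seconds.
    # Linearize ||x - p_i||^2 = (C * t_i)^2 against the first RSU and solve.
    rsu_xy = np.asarray(rsu_xy, dtype=float)
    r = C * np.asarray(toa, dtype=float)
    p0, r0 = rsu_xy[0], r[0]
    A = 2.0 * (rsu_xy[1:] - p0)
    b = r0 ** 2 - r[1:] ** 2 + np.sum(rsu_xy[1:] ** 2, axis=1) - np.sum(p0 ** 2)
    return np.linalg.lstsq(A, b, rcond=None)[0]

def is_legitimate(estimated_xy, predicted_xy, threshold_m=5.0):
    # H0 (legitimate) if the ToA estimate stays close to the ML-predicted track.
    return bool(np.linalg.norm(estimated_xy - predicted_xy) <= threshold_m)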
We try two ML models for this purpose, i.e., support\nvector regression and decision tree. To evaluate our scheme, we conduct binary\nhypothesis testing on the estimated positions with the help of the ground\ntruths provided by the ML model, which classifies the transmitter node as\nlegitimate or malicious. Moreover, we consider the probability of false alarm\nand the probability of missed detection as performance metrics resulting from\nthe binary hypothesis testing, and mean absolute error (MAE), mean square error\n(MSE), and coefficient of determination $\\text{R}^2$ to further evaluate the ML\nmodels. We also compare our scheme with a baseline scheme that exploits the\nangle of arrival at RSUs for authentication. We observe that our proposed\nposition-based mechanism outperforms the baseline scheme significantly in terms\nof missed detections.\n","authors":["Hala Amin","Jawaher Kaldari","Nora Mohamed","Waqas Aman","Saif Al-Kuwari"],"pdf_url":"https://arxiv.org/pdf/2308.14693v1.pdf","comment":"Accepted for Publication following Presentation at IEEE ISNCC-23"},{"id":"http://arxiv.org/abs/2308.14683v1","updated":"2023-08-28T16:18:50Z","published":"2023-08-28T16:18:50Z","title":"Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual\n Predatory Chats and Abusive Texts","summary":" Detecting online sexual predatory behaviours and abusive language on social\nmedia platforms has become a critical area of research due to the growing\nconcerns about online safety, especially for vulnerable populations such as\nchildren and adolescents. Researchers have been exploring various techniques\nand approaches to develop effective detection systems that can identify and\nmitigate these risks. Recent development of large language models (LLMs) has\nopened a new opportunity to address this problem more effectively. This paper\nproposes an approach to detection of online sexual predatory chats and abusive\nlanguage using the open-source pretrained Llama 2 7B-parameter model, recently\nreleased by Meta GenAI. We fine-tune the LLM using datasets with different\nsizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu).\nBased on the power of LLMs, our approach is generic and automated without a\nmanual search for a synergy between feature extraction and classifier design\nsteps like conventional methods in this domain. Experimental results show a\nstrong performance of the proposed approach, which performs proficiently and\nconsistently across three distinct datasets with five sets of experiments. This\nstudy's outcomes indicate that the proposed method can be implemented in\nreal-world applications (even with non-English languages) for flagging sexual\npredators, offensive or toxic content, hate speech, and discriminatory language\nin online discussions and comments to maintain respectful internet or digital\ncommunities. 
Furthermore, it can be employed for solving text classification\nproblems with other potential applications such as sentiment analysis, spam and\nphishing detection, sorting legal documents, fake news detection, language\nidentification, user intent recognition, text-based product categorization,\nmedical record analysis, and resume screening.\n","authors":["Thanh Thi Nguyen","Campbell Wilson","Janis Dalins"],"pdf_url":"https://arxiv.org/pdf/2308.14683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06243v2","updated":"2023-08-28T16:18:30Z","published":"2023-07-12T15:34:10Z","title":"Reconstructing Spatiotemporal Data with C-VAEs","summary":" The continuous representation of spatiotemporal data commonly relies on using\nabstract data types, such as \\textit{moving regions}, to represent entities\nwhose shape and position continuously change over time. Creating this\nrepresentation from discrete snapshots of real-world entities requires using\ninterpolation methods to compute in-between data representations and estimate\nthe position and shape of the object of interest at arbitrary temporal points.\nExisting region interpolation methods often fail to generate smooth and\nrealistic representations of a region's evolution. However, recent advancements\nin deep learning techniques have revealed the potential of deep models trained\non discrete observations to capture spatiotemporal dependencies through\nimplicit feature learning.\n In this work, we explore the capabilities of Conditional Variational\nAutoencoder (C-VAE) models to generate smooth and realistic representations of\nthe spatiotemporal evolution of moving regions. We evaluate our proposed\napproach on a sparsely annotated dataset on the burnt area of a forest fire. We\napply compression operations to sample from the dataset and use the C-VAE model\nand other commonly used interpolation algorithms to generate in-between region\nrepresentations. To evaluate the performance of the methods, we compare their\ninterpolation results with manually annotated data and regions generated by a\nU-Net model. We also assess the quality of generated data considering temporal\nconsistency metrics.\n The proposed C-VAE-based approach demonstrates competitive results in\ngeometric similarity metrics. It also exhibits superior temporal consistency,\nsuggesting that C-VAE models may be a viable alternative to modelling the\nspatiotemporal evolution of 2D moving regions.\n","authors":["Tiago F. R. Ribeiro","Fernando Silva","Rogério Luís de C. Costa"],"pdf_url":"https://arxiv.org/pdf/2307.06243v2.pdf","comment":"Update acknowledgments to include published article information"},{"id":"http://arxiv.org/abs/2211.11869v3","updated":"2023-08-28T16:14:57Z","published":"2022-11-21T21:42:50Z","title":"Examining Policy Entropy of Reinforcement Learning Agents for\n Personalization Tasks","summary":" This effort is focused on examining the behavior of reinforcement learning\nsystems in personalization environments and detailing the differences in policy\nentropy associated with the type of learning algorithm utilized. We demonstrate\nthat Policy Optimization agents often possess low-entropy policies during\ntraining, which in practice results in agents prioritizing certain actions and\navoiding others. Conversely, we also show that Q-Learning agents are far less\nsusceptible to such behavior and generally maintain high-entropy policies\nthroughout training, which is often preferable in real-world applications. 
We\nprovide a wide range of numerical experiments as well as theoretical\njustification to show that these differences in entropy are due to the type of\nlearning being employed.\n","authors":["Anton Dereventsov","Andrew Starnes","Clayton G. Webster"],"pdf_url":"https://arxiv.org/pdf/2211.11869v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10145v3","updated":"2023-08-28T16:13:03Z","published":"2023-08-20T03:12:10Z","title":"Wasserstein Geodesic Generator for Conditional Distributions","summary":" Generating samples given a specific label requires estimating conditional\ndistributions. We derive a tractable upper bound of the Wasserstein distance\nbetween conditional distributions to lay the theoretical groundwork to learn\nconditional distributions. Based on this result, we propose a novel conditional\ngeneration algorithm where conditional distributions are fully characterized by\na metric space defined by a statistical distance. We employ optimal transport\ntheory to propose the Wasserstein geodesic generator, a new conditional\ngenerator that learns the Wasserstein geodesic. The proposed method learns both\nconditional distributions for observed domains and optimal transport maps\nbetween them. The conditional distributions given unobserved intermediate\ndomains are on the Wasserstein geodesic between conditional distributions given\ntwo observed domain labels. Experiments on face images with light conditions as\ndomain labels demonstrate the efficacy of the proposed method.\n","authors":["Young-geun Kim","Kyungbok Lee","Youngwon Choi","Joong-Ho Won","Myunghee Cho Paik"],"pdf_url":"https://arxiv.org/pdf/2308.10145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18160v2","updated":"2023-08-28T15:59:26Z","published":"2023-05-29T15:41:12Z","title":"Counterpart Fairness -- Addressing Systematic between-group Differences\n in Fairness Evaluation","summary":" When using machine learning (ML) to aid decision-making, it is critical to\nensure that an algorithmic decision is fair, i.e., it does not discriminate\nagainst specific individuals/groups, particularly those from underprivileged\npopulations. Existing group fairness methods require equal group-wise measures,\nwhich however fails to consider systematic between-group differences. The\nconfounding factors, which are non-sensitive variables but manifest systematic\ndifferences, can significantly affect fairness evaluation. To tackle this\nproblem, we believe that a fairness measurement should be based on the\ncomparison between counterparts (i.e., individuals who are similar to each\nother with respect to the task of interest) from different groups, whose group\nidentities cannot be distinguished algorithmically by exploring confounding\nfactors. We have developed a propensity-score-based method for identifying\ncounterparts, which prevents fairness evaluation from comparing \"oranges\" with\n\"apples\". In addition, we propose a counterpart-based statistical fairness\nindex, termed Counterpart-Fairness (CFair), to assess fairness of ML models.\nVarious empirical studies were conducted to validate the effectiveness of\nCFair. 
We publish our code at \\url{https://github.com/zhengyjo/CFair}.\n","authors":["Yifei Wang","Zhengyang Zhou","Liqin Wang","John Laurentiev","Peter Hou","Li Zhou","Pengyu Hong"],"pdf_url":"https://arxiv.org/pdf/2305.18160v2.pdf","comment":"25 pages, 6 figures, 16 tables"},{"id":"http://arxiv.org/abs/2305.02527v2","updated":"2023-08-28T15:52:36Z","published":"2023-05-04T03:31:30Z","title":"Reinforcement Learning with Delayed, Composite, and Partially Anonymous\n Reward","summary":" We investigate an infinite-horizon average reward Markov Decision Process\n(MDP) with delayed, composite, and partially anonymous reward feedback. The\ndelay and compositeness of rewards mean that rewards generated as a result of\ntaking an action at a given state are fragmented into different components, and\nthey are sequentially realized at delayed time instances. The partial anonymity\nattribute implies that a learner, for each state, only observes the aggregate\nof past reward components generated as a result of different actions taken at\nthat state, but realized at the observation instance. We propose an algorithm\nnamed $\\mathrm{DUCRL2}$ to obtain a near-optimal policy for this setting and\nshow that it achieves a regret bound of $\\tilde{\\mathcal{O}}\\left(DS\\sqrt{AT} +\nd (SA)^3\\right)$ where $S$ and $A$ are the sizes of the state and action\nspaces, respectively, $D$ is the diameter of the MDP, $d$ is a parameter upper\nbounded by the maximum reward delay, and $T$ denotes the time horizon. This\ndemonstrates the optimality of the bound in the order of $T$, and an additive\nimpact of the delay.\n","authors":["Washim Uddin Mondal","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2305.02527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14659v1","updated":"2023-08-28T15:41:30Z","published":"2023-08-28T15:41:30Z","title":"RESTORE: Graph Embedding Assessment Through Reconstruction","summary":" Following the success of Word2Vec embeddings, graph embeddings (GEs) have\ngained substantial traction. GEs are commonly generated and evaluated\nextrinsically on downstream applications, but intrinsic evaluations of the\noriginal graph properties in terms of topological structure and semantic\ninformation have been lacking. Understanding these will help identify the\ndeficiency of the various families of GE methods when vectorizing graphs in\nterms of preserving the relevant knowledge or learning incorrect knowledge. To\naddress this, we propose RESTORE, a framework for intrinsic GEs assessment\nthrough graph reconstruction. We show that reconstructing the original graph\nfrom the underlying GEs yields insights into the relative amount of information\npreserved in a given vector form. We first introduce the graph reconstruction\ntask. We generate GEs from three GE families based on factorization methods,\nrandom walks, and deep learning (with representative algorithms from each\nfamily) on the CommonSense Knowledge Graph (CSKG). We analyze their\neffectiveness in preserving the (a) topological structure of node-level graph\nreconstruction with an increasing number of hops and (b) semantic information\non various word semantic and analogy tests. 
Our evaluations show deep\nlearning-based GE algorithm (SDNE) is overall better at preserving (a) with a\nmean average precision (mAP) of 0.54 and 0.35 for 2 and 3-hop reconstruction\nrespectively, while the factorization-based algorithm (HOPE) is better at\nencapsulating (b) with an average Euclidean distance of 0.14, 0.17, and 0.11\nfor 1, 2, and 3-hop reconstruction respectively. The modest performance of\nthese GEs leaves room for further research avenues on better graph\nrepresentation learning.\n","authors":["Hong Yung Yip","Chidaksh Ravuru","Neelabha Banerjee","Shashwat Jha","Amit Sheth","Aman Chadha","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2308.14659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14658v1","updated":"2023-08-28T15:40:50Z","published":"2023-08-28T15:40:50Z","title":"Adversarial Predictions of Data Distributions Across Federated\n Internet-of-Things Devices","summary":" Federated learning (FL) is increasingly becoming the default approach for\ntraining machine learning models across decentralized Internet-of-Things (IoT)\ndevices. A key advantage of FL is that no raw data are communicated across the\nnetwork, providing an immediate layer of privacy. Despite this, recent works\nhave demonstrated that data reconstruction can be done with the locally trained\nmodel updates which are communicated across the network. However, many of these\nworks have limitations with regard to how the gradients are computed in\nbackpropagation. In this work, we demonstrate that the model weights shared in\nFL can expose revealing information about the local data distributions of IoT\ndevices. This leakage could expose sensitive information to malicious actors in\na distributed system. We further discuss results which show that injecting\nnoise into model weights is ineffective at preventing data leakage without\nseriously harming the global model accuracy.\n","authors":["Samir Rajani","Dario Dematties","Nathaniel Hudson","Kyle Chard","Nicola Ferrier","Rajesh Sankaran","Peter Beckman"],"pdf_url":"https://arxiv.org/pdf/2308.14658v1.pdf","comment":"6 pages, 6 figures, accepted for publication through 2023 IEEE World\n Forum on Internet of Things"},{"id":"http://arxiv.org/abs/2308.08086v2","updated":"2023-08-28T15:40:02Z","published":"2023-08-16T01:30:13Z","title":"Safety Filter Design for Neural Network Systems via Convex Optimization","summary":" With the increase in data availability, it has been widely demonstrated that\nneural networks (NN) can capture complex system dynamics precisely in a\ndata-driven manner. However, the architectural complexity and nonlinearity of\nthe NNs make it challenging to synthesize a provably safe controller. In this\nwork, we propose a novel safety filter that relies on convex optimization to\nensure safety for a NN system, subject to additive disturbances that are\ncapable of capturing modeling errors. Our approach leverages tools from NN\nverification to over-approximate NN dynamics with a set of linear bounds,\nfollowed by an application of robust linear MPC to search for controllers that\ncan guarantee robust constraint satisfaction. We demonstrate the efficacy of\nthe proposed framework numerically on a nonlinear pendulum system.\n","authors":["Shaoru Chen","Kong Yao Chee","Nikolai Matni","M. Ani Hsieh","George J. 
Pappas"],"pdf_url":"https://arxiv.org/pdf/2308.08086v2.pdf","comment":"This paper has been accepted to the 2023 62nd IEEE Conference on\n Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2308.01674v2","updated":"2023-08-28T15:38:47Z","published":"2023-08-03T10:21:53Z","title":"End-to-End Reinforcement Learning of Koopman Models for Economic\n Nonlinear Model Predictive Control","summary":" (Economic) nonlinear model predictive control ((e)NMPC) requires dynamic\nsystem models that are sufficiently accurate in all relevant state-space\nregions. These models must also be computationally cheap enough to ensure\nreal-time tractability. Data-driven surrogate models for mechanistic models can\nbe used to reduce the computational burden of (e)NMPC; however, such models are\ntypically trained by system identification for maximum average prediction\naccuracy on simulation samples and perform suboptimally as part of actual\n(e)NMPC. We present a method for end-to-end reinforcement learning of dynamic\nsurrogate models for optimal performance in (e)NMPC applications, resulting in\npredictive controllers that strike a favorable balance between control\nperformance and computational demand. We validate our method on two\napplications derived from an established nonlinear continuous stirred-tank\nreactor model. We compare the controller performance to that of MPCs utilizing\nmodels trained by the prevailing maximum prediction accuracy paradigm, and\nmodel-free neural network controllers trained using reinforcement learning. We\nshow that our method matches the performance of the model-free neural network\ncontrollers while consistently outperforming models derived from system\nidentification. Additionally, we show that the MPC policies can react to\nchanges in the control setting without retraining.\n","authors":["Daniel Mayfrank","Alexander Mitsos","Manuel Dahmen"],"pdf_url":"https://arxiv.org/pdf/2308.01674v2.pdf","comment":"manuscript (18 pages, 7 figures, 5 tables), supplementary materials\n (3 pages, 2 tables)"},{"id":"http://arxiv.org/abs/2306.11167v2","updated":"2023-08-28T15:34:27Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if not none, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impede human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. 
The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v2.pdf","comment":"V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3\n (added). 22 pages with Appendix"},{"id":"http://arxiv.org/abs/2308.14650v1","updated":"2023-08-28T15:22:15Z","published":"2023-08-28T15:22:15Z","title":"Comparison of automated crater catalogs for Mars from Benedix et al.\n (2020) and Lee and Hogan (2021)","summary":" Crater mapping using neural networks and other automated methods has\nincreased recently with automated Crater Detection Algorithms (CDAs) applied to\nplanetary bodies throughout the solar system. A recent publication by Benedix\net al. (2020) showed high performance at small scales compared to similar\nautomated CDAs but with a net positive diameter bias in many crater candidates.\nI compare the publicly available catalogs from Benedix et al. (2020) and Lee &\nHogan (2021) and show that the reported performance is sensitive to the metrics\nused to test the catalogs. I show how the more permissive comparison methods\nindicate a higher CDA performance by allowing worse candidate craters to match\nground-truth craters. I show that the Benedix et al. (2020) catalog has a\nsubstantial performance loss with increasing latitude and identify an image\nprojection issue that might cause this loss. Finally, I suggest future\napplications of neural networks in generating large scientific datasets be\nvalidated using secondary networks with independent data sources or training\nmethods.\n","authors":["Christopher Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14650v1.pdf","comment":"14 pages, 6 figures. Accepted August 13th 2023"},{"id":"http://arxiv.org/abs/2308.14647v1","updated":"2023-08-28T15:19:18Z","published":"2023-08-28T15:19:18Z","title":"Edge Generation Scheduling for DAG Tasks using Deep Reinforcement\n Learning","summary":" Directed acyclic graph (DAG) tasks are currently adopted in the real-time\ndomain to model complex applications from the automotive, avionics, and\nindustrial domain that implement their functionalities through chains of\nintercommunicating tasks. This paper studies the problem of scheduling\nreal-time DAG tasks by presenting a novel schedulability test based on the\nconcept of trivial schedulability. Using this schedulability test, we propose a\nnew DAG scheduling framework (edge generation scheduling -- EGS) that attempts\nto minimize the DAG width by iteratively generating edges while guaranteeing\nthe deadline constraint. 
We study how to efficiently solve the problem of\ngenerating edges by developing a deep reinforcement learning algorithm combined\nwith a graph representation neural network to learn an efficient edge\ngeneration policy for EGS. We evaluate the effectiveness of the proposed\nalgorithm by comparing it with state-of-the-art DAG scheduling heuristics and\nan optimal mixed-integer linear programming baseline. Experimental results show\nthat the proposed algorithm outperforms the state-of-the-art by requiring fewer\nprocessors to schedule the same DAG tasks.\n","authors":["Binqi Sun","Mirco Theile","Ziyuan Qin","Daniele Bernardini","Debayan Roy","Andrea Bastoni","Marco Caccamo"],"pdf_url":"https://arxiv.org/pdf/2308.14647v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.14644v1","updated":"2023-08-28T15:16:35Z","published":"2023-08-28T15:16:35Z","title":"Human Comfortability Index Estimation in Industrial Human-Robot\n Collaboration Task","summary":" Fluent human-robot collaboration requires a robot teammate to understand,\nlearn, and adapt to the human's psycho-physiological state. Such collaborations\nrequire a computing system that monitors human physiological signals during\nhuman-robot collaboration (HRC) to quantitatively estimate a human's level of\ncomfort, which we term in this research the comfortability index (CI) and\nthe uncomfortability index (unCI). Subjective metrics (surprise, anxiety, boredom,\ncalmness, and comfortability) and physiological signals were collected during a\nhuman-robot collaboration experiment that varied robot behavior. The emotion\ncircumplex model is adapted to calculate the CI from the participant's\nquantitative data as well as physiological data. To estimate CI/unCI from\nphysiological signals, time features were extracted from electrocardiogram\n(ECG), galvanic skin response (GSR), and pupillometry signals. In this\nresearch, we successfully adapt the circumplex model to find the locations\n(axes) of 'comfortability' and 'uncomfortability' on the circumplex model, and\nshow that these locations match the closest emotions on the model. Finally,\nthe study showed that the proposed approach can estimate human\ncomfortability/uncomfortability from physiological signals.\n","authors":["Celal Savur","Jamison Heard","Ferat Sahin"],"pdf_url":"https://arxiv.org/pdf/2308.14644v1.pdf","comment":"Submitted to IEEE-THMS"},{"id":"http://arxiv.org/abs/2308.14642v1","updated":"2023-08-28T15:16:09Z","published":"2023-08-28T15:16:09Z","title":"Rate-Optimal Policy Optimization for Linear Markov Decision Processes","summary":" We study regret minimization in online episodic linear Markov Decision\nProcesses, and obtain rate-optimal $\\widetilde O (\\sqrt K)$ regret where $K$\ndenotes the number of episodes. 
Our work is the first to establish the optimal\n(w.r.t.~$K$) rate of convergence in the stochastic setting with bandit feedback\nusing a policy optimization based approach, and the first to establish the\noptimal (w.r.t.~$K$) rate in the adversarial setup with full information\nfeedback, for which no algorithm with an optimal rate guarantee is currently\nknown.\n","authors":["Uri Sherman","Alon Cohen","Tomer Koren","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2308.14642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14634v1","updated":"2023-08-28T15:04:16Z","published":"2023-08-28T15:04:16Z","title":"Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance","summary":" We propose the use of conversational GPT models for easy and quick few-shot\ntext classification in the financial domain using the Banking77 dataset. Our\napproach involves in-context learning with GPT-3.5 and GPT-4, which minimizes\nthe technical expertise required and eliminates the need for expensive GPU\ncomputing while yielding quick and accurate results. Additionally, we fine-tune\nother pre-trained, masked language models with SetFit, a recent contrastive\nlearning technique, to achieve state-of-the-art results both in full-data and\nfew-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can\noutperform fine-tuned, non-generative models even with fewer examples. However,\nsubscription fees associated with these solutions may be considered costly for\nsmall organizations. Lastly, we find that generative models perform better on\nthe given task when shown representative samples selected by a human expert\nrather than when shown random ones. We conclude that a) our proposed methods\noffer a practical solution for few-shot tasks in datasets with limited label\navailability, and b) our state-of-the-art results can inspire future work in\nthe area.\n","authors":["Lefteris Loukas","Ilias Stogiannidis","Prodromos Malakasiotis","Stavros Vassos"],"pdf_url":"https://arxiv.org/pdf/2308.14634v1.pdf","comment":"Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023"},{"id":"http://arxiv.org/abs/2308.14632v1","updated":"2023-08-28T14:57:29Z","published":"2023-08-28T14:57:29Z","title":"Comparing AutoML and Deep Learning Methods for Condition Monitoring\n using Realistic Validation Scenarios","summary":" This study extensively compares conventional machine learning methods and\ndeep learning for condition monitoring tasks using an AutoML toolbox. The\nexperiments reveal consistent high accuracy in random K-fold cross-validation\nscenarios across all tested models. However, when employing leave-one-group-out\n(LOGO) cross-validation on the same datasets, no clear winner emerges,\nindicating the presence of domain shift in real-world scenarios. Additionally,\nthe study assesses the scalability and interpretability of conventional methods\nand neural networks. Conventional methods offer explainability with their\nmodular structure aiding feature identification. In contrast, neural networks\nrequire specialized interpretation techniques like occlusion maps to visualize\nimportant regions in the input data. Finally, the paper highlights the\nsignificance of feature selection, particularly in condition monitoring tasks\nwith limited class variations. Low-complexity models prove sufficient for such\ntasks, as only a few features from the input signal are typically needed. 
In\nsummary, these findings offer crucial insights into the strengths and\nlimitations of various approaches, providing valuable benchmarks and\nidentifying the most suitable methods for condition monitoring applications,\nthereby enhancing their applicability in real-world scenarios.\n","authors":["Payman Goodarzi","Andreas Schütze","Tizian Schneider"],"pdf_url":"https://arxiv.org/pdf/2308.14632v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2308.14626v1","updated":"2023-08-28T14:48:49Z","published":"2023-08-28T14:48:49Z","title":"VesselShot: Few-shot learning for cerebral blood vessel segmentation","summary":" Angiography is widely used to detect, diagnose, and treat cerebrovascular\ndiseases. While numerous techniques have been proposed to segment the vascular\nnetwork from different imaging modalities, deep learning (DL) has emerged as a\npromising approach. However, existing DL methods often depend on proprietary\ndatasets and extensive manual annotation. Moreover, the availability of\npre-trained networks specifically for medical domains and 3D volumes is\nlimited. To overcome these challenges, we propose a few-shot learning approach\ncalled VesselShot for cerebrovascular segmentation. VesselShot leverages\nknowledge from a few annotated support images and mitigates the scarcity of\nlabeled data and the need for extensive annotation in cerebral blood vessel\nsegmentation. We evaluated the performance of VesselShot using the publicly\navailable TubeTK dataset for the segmentation task, achieving a mean Dice\ncoefficient (DC) of 0.62(0.03).\n","authors":["Mumu Aktar","Hassan Rivaz","Marta Kersten-Oertel","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02054v2","updated":"2023-08-28T14:38:33Z","published":"2023-05-03T11:39:31Z","title":"Map-based Experience Replay: A Memory-Efficient Solution to Catastrophic\n Forgetting in Reinforcement Learning","summary":" Deep Reinforcement Learning agents often suffer from catastrophic forgetting,\nforgetting previously found solutions in parts of the input space when training\non new data. Replay Memories are a common solution to the problem,\ndecorrelating and shuffling old and new training samples. They naively store\nstate transitions as they come in, without regard for redundancy. We introduce\na novel cognitive-inspired replay memory approach based on the\nGrow-When-Required (GWR) self-organizing network, which resembles a map-based\nmental model of the world. Our approach organizes stored transitions into a\nconcise environment-model-like network of state-nodes and transition-edges,\nmerging similar samples to reduce the memory size and increase pair-wise\ndistance among samples, which increases the relevancy of each sample. Overall,\nour paper shows that map-based experience replay allows for significant memory\nreduction with only small performance decreases.\n","authors":["Muhammad Burhan Hafez","Tilman Immisch","Tom Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2305.02054v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.05151v3","updated":"2023-08-28T14:38:11Z","published":"2022-11-09T19:02:40Z","title":"QuadConv: Quadrature-Based Convolutions with Applications to Non-Uniform\n PDE Data Compression","summary":" We present a new convolution layer for deep learning architectures which we\ncall QuadConv -- an approximation to continuous convolution via quadrature. 
Our\noperator is developed explicitly for use on non-uniform, mesh-based data, and\naccomplishes this by learning a continuous kernel that can be sampled at\narbitrary locations. Moreover, the construction of our operator admits an\nefficient implementation which we detail and construct. As an experimental\nvalidation of our operator, we consider the task of compressing partial\ndifferential equation (PDE) simulation data from fixed meshes. We show that\nQuadConv can match the performance of standard discrete convolutions on uniform\ngrid data by comparing a QuadConv autoencoder (QCAE) to a standard\nconvolutional autoencoder (CAE). Further, we show that the QCAE can maintain\nthis accuracy even on non-uniform data. In both cases, QuadConv also\noutperforms alternative unstructured convolution methods such as graph\nconvolution.\n","authors":["Kevin Doherty","Cooper Simpson","Stephen Becker","Alireza Doostan"],"pdf_url":"https://arxiv.org/pdf/2211.05151v3.pdf","comment":"26 pages, 18 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.19442v4","updated":"2023-08-28T14:29:19Z","published":"2023-05-30T22:30:30Z","title":"SimFBO: Towards Simple, Flexible and Communication-efficient Federated\n Bilevel Learning","summary":" Federated bilevel optimization (FBO) has shown great potential recently in\nmachine learning and edge computing due to the emerging nested optimization\nstructure in meta-learning, fine-tuning, hyperparameter tuning, etc. However,\nexisting FBO algorithms often involve complicated computations and require\nmultiple sub-loops per iteration, each of which contains a number of\ncommunication rounds. In this paper, we propose a simple and flexible FBO\nframework named SimFBO, which is easy to implement without sub-loops, and\nincludes a generalized server-side aggregation and update for improving\ncommunication efficiency. We further propose System-level heterogeneity robust\nFBO (ShroFBO) as a variant of SimFBO with stronger resilience to heterogeneous\nlocal computation. We show that SimFBO and ShroFBO provably achieve a linear\nconvergence speedup with partial client participation and client sampling\nwithout replacement, as well as improved sample and communication complexities.\nExperiments demonstrate the effectiveness of the proposed methods over existing\nFBO algorithms.\n","authors":["Yifan Yang","Peiyao Xiao","Kaiyi Ji"],"pdf_url":"https://arxiv.org/pdf/2305.19442v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14608v1","updated":"2023-08-28T14:23:04Z","published":"2023-08-28T14:23:04Z","title":"AI in the Gray: Exploring Moderation Policies in Dialogic Large Language\n Models vs. Human Answers in Controversial Topics","summary":" The introduction of ChatGPT and the subsequent improvement of Large Language\nModels (LLMs) have prompted more and more individuals to turn to the use of\nChatBots, both for information and assistance with decision-making. However,\nthe information the user is after is often not formulated by these ChatBots\nobjectively enough to be provided with a definite, globally accepted answer.\n Controversial topics, such as \"religion\", \"gender identity\", \"freedom of\nspeech\", and \"equality\", among others, can be a source of conflict as partisan\nor biased answers can reinforce preconceived notions or promote disinformation.\nBy exposing ChatGPT to such debatable questions, we aim to understand its level\nof awareness and if existing models are subject to socio-political and/or\neconomic biases. 
We also aim to explore how AI-generated answers compare to\nhuman ones. To explore this, we use a dataset of a social media platform\ncreated for the purpose of debating human-generated claims on polemic subjects\namong users, dubbed Kialo.\n Our results show that while previous versions of ChatGPT have had important\nissues with controversial topics, more recent versions of ChatGPT\n(gpt-3.5-turbo) are no longer manifesting significant explicit biases in\nseveral knowledge areas. In particular, it is well-moderated regarding economic\naspects. However, it still maintains degrees of implicit libertarian leaning\ntoward right-wing ideals, which suggests the need for increased moderation from\nthe socio-political point of view. In terms of domain knowledge on\ncontroversial topics, with the exception of the \"Philosophical\" category,\nChatGPT is performing well in keeping up with the collective human level of\nknowledge. Finally, we see that the sources of Bing AI tend slightly more\nto the center when compared to human answers. All the analyses we make are\ngeneralizable to other types of biases and domains.\n","authors":["Vahid Ghafouri","Vibhor Agarwal","Yong Zhang","Nishanth Sastry","Jose Such","Guillermo Suarez-Tangil"],"pdf_url":"https://arxiv.org/pdf/2308.14608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14606v1","updated":"2023-08-28T14:20:53Z","published":"2023-08-28T14:20:53Z","title":"On the Tradeoff between Privacy Preservation and Byzantine-Robustness in\n Decentralized Learning","summary":" This paper jointly considers privacy preservation and Byzantine-robustness in\ndecentralized learning. In a decentralized network, honest-but-curious agents\nfaithfully follow the prescribed algorithm, but expect to infer their\nneighbors' private data from messages received during the learning process,\nwhile dishonest-and-Byzantine agents disobey the prescribed algorithm, and\ndeliberately disseminate wrong messages to their neighbors so as to bias the\nlearning process. For this novel setting, we investigate a generic\nprivacy-preserving and Byzantine-robust decentralized stochastic gradient\ndescent (SGD) framework, in which Gaussian noise is injected to preserve\nprivacy and robust aggregation rules are adopted to counteract Byzantine\nattacks. We analyze its learning error and privacy guarantee, discovering an\nessential tradeoff between privacy preservation and Byzantine-robustness in\ndecentralized learning -- the learning error caused by defending against\nByzantine attacks is exacerbated by the Gaussian noise added to preserve\nprivacy. Numerical experiments are conducted and corroborate our theoretical\nfindings.\n","authors":["Haoxiang Ye","Heng Zhu","Qing Ling"],"pdf_url":"https://arxiv.org/pdf/2308.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14602v1","updated":"2023-08-28T14:12:52Z","published":"2023-08-28T14:12:52Z","title":"Recent Progress in Energy Management of Connected Hybrid Electric\n Vehicles Using Reinforcement Learning","summary":" The growing adoption of hybrid electric vehicles (HEVs) presents a\ntransformative opportunity for revolutionizing transportation energy systems.\nThe shift towards electrifying transportation aims to curb environmental\nconcerns related to fossil fuel consumption. This necessitates efficient energy\nmanagement systems (EMS) to optimize energy efficiency. The evolution of EMS\nfrom HEVs to connected hybrid electric vehicles (CHEVs) represents a pivotal\nshift. 
Beyond HEVs, EMS now confronts the intricate energy cooperation\nrequirements of CHEVs, necessitating advanced algorithms for route\noptimization, charging coordination, and load distribution. Challenges persist\nin both domains, including optimal energy utilization for HEVs, and cooperative\neco-driving control (CED) for CHEVs across diverse vehicle types. Reinforcement\nlearning (RL) stands out as a promising tool for addressing the challenges at\nhand. Specifically, within the realm of CHEVs, the application of multi-agent\nreinforcement learning (MARL) emerges as a powerful approach for effectively\ntackling the intricacies of CED control. Despite extensive research, few\nreviews span from individual vehicles to multi-vehicle scenarios. This review\nbridges the gap, highlighting challenges, advancements, and potential\ncontributions of RL-based solutions for future sustainable transportation\nsystems.\n","authors":["Min Hua","Bin Shuai","Quan Zhou","Jinhai Wang","Yinglong He","Hongming Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14601v1","updated":"2023-08-28T14:12:25Z","published":"2023-08-28T14:12:25Z","title":"Fairness Through Domain Awareness: Mitigating Popularity Bias For Music\n Discovery","summary":" As online music platforms grow, music recommender systems play a vital role\nin helping users navigate and discover content within their vast musical\ndatabases. At odds with this larger goal is the presence of popularity bias,\nwhich causes algorithmic systems to favor mainstream content over potentially\nmore relevant but niche items. In this work we explore the intrinsic\nrelationship between music discovery and popularity bias. To mitigate this\nissue we propose a domain-aware, individual fairness-based approach which\naddresses popularity bias in graph neural network (GNN)-based recommender\nsystems. Our approach uses individual fairness to reflect a ground truth\nlistening experience, i.e., if two songs sound similar, this similarity should\nbe reflected in their representations. In doing so, we facilitate meaningful\nmusic discovery that is robust to popularity bias and grounded in the music\ndomain. We apply our BOOST methodology to two discovery-based tasks, performing\nrecommendations at both the playlist level and user level. Then, we ground our\nevaluation in the cold start setting, showing that our approach outperforms\nexisting fairness benchmarks in both performance and recommendation of\nlesser-known content. Finally, our analysis explains why our proposed\nmethodology is a novel and promising approach to mitigating popularity bias and\nimproving the discovery of new and niche content in music recommender systems.\n","authors":["Rebecca Salganik","Fernando Diaz","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2308.14601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12561v2","updated":"2023-08-28T14:10:50Z","published":"2022-12-23T19:37:39Z","title":"An active learning method for solving competitive multi-agent\n decision-making and control problems","summary":" We propose a scheme based on active learning to reconstruct private\nstrategies executed by a population of interacting agents and predict an exact\noutcome of the underlying multi-agent interaction process, here identified as a\nstationary action profile. 
We envision a scenario where an external observer,\nendowed with a learning procedure, can make queries and observe the agents'\nreactions through private action-reaction mappings, whose collective fixed\npoint corresponds to a stationary profile. By iteratively collecting sensible\ndata and updating parametric estimates of the action-reaction mappings, we\nestablish sufficient conditions to assess the asymptotic properties of the\nproposed active learning methodology so that, if convergence happens, it can\nonly be towards a stationary action profile. This fact yields two main\nconsequences: i) learning locally-exact surrogates of the action-reaction\nmappings allows the external observer to succeed in its prediction task, and\nii) working with assumptions so general that a stationary profile is not even\nguaranteed to exist, the established sufficient conditions hence act also as\ncertificates for the existence of such a desirable profile. Extensive numerical\nsimulations involving typical competitive multi-agent control and\ndecision-making problems illustrate the practical effectiveness of the proposed\nlearning-based approach.\n","authors":["Filippo Fabiani","Alberto Bemporad"],"pdf_url":"https://arxiv.org/pdf/2212.12561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13368v2","updated":"2023-08-28T14:09:47Z","published":"2022-12-27T05:51:54Z","title":"Deep Reinforcement Learning for Wind and Energy Storage Coordination in\n Wholesale Energy and Ancillary Service Markets","summary":" Wind energy has been increasingly adopted to mitigate climate change.\nHowever, the variability of wind energy causes wind curtailment, resulting in\nconsiderable economic losses for wind farm owners. Wind curtailment can be\nreduced using battery energy storage systems (BESS) as onsite backup sources.\nYet, this auxiliary role may significantly weaken the economic potential of\nBESS in energy trading. Ideal BESS scheduling should balance onsite wind\ncurtailment reduction and market bidding, but practical implementation is\nchallenging due to coordination complexity and the stochastic nature of energy\nprices and wind generation. We investigate the joint-market bidding strategy of\na co-located wind-battery system in the spot and Regulation Frequency Control\nAncillary Service markets. We propose a novel deep reinforcement learning-based\napproach that decouples the system's market participation into two related\nMarkov decision processes for each facility, enabling the BESS to absorb onsite\nwind curtailment while performing joint-market bidding to maximize overall\noperational revenues. Using realistic wind farm data, we validated the\ncoordinated bidding strategy, with outcomes surpassing the optimization-based\nbenchmark in terms of higher revenue by approximately 25\\% and more wind\ncurtailment reduction by 2.3 times. Our results show that joint-market bidding\ncan significantly improve the financial performance of wind-battery systems\ncompared to participating in each market separately. Simulations also show that\nusing curtailed wind generation as a power source for charging the BESS can\nlead to additional financial gains. 
The successful implementation of our\nalgorithm would encourage co-location of generation and storage assets to\nunlock wider system benefits.\n","authors":["Jinhao Li","Changlong Wang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2212.13368v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14597v1","updated":"2023-08-28T14:09:02Z","published":"2023-08-28T14:09:02Z","title":"Adversarial Attacks on Foundational Vision Models","summary":" Rapid progress is being made in developing large, pretrained, task-agnostic\nfoundational vision models such as CLIP, ALIGN, DINOv2, etc. In fact, we are\napproaching the point where these models do not have to be finetuned\ndownstream, and can simply be used in zero-shot or with a lightweight probing\nhead. Critically, given the complexity of working at this scale, there is a\nbottleneck where relatively few organizations in the world are executing the\ntraining then sharing the models on centralized platforms such as HuggingFace\nand torch.hub. The goal of this work is to identify several key adversarial\nvulnerabilities of these models in an effort to make future designs more\nrobust. Intuitively, our attacks manipulate deep feature representations to\nfool an out-of-distribution (OOD) detector which will be required when using\nthese open-world-aware models to solve closed-set downstream tasks. Our methods\nreliably make in-distribution (ID) images (w.r.t. a downstream task) be\npredicted as OOD and vice versa while existing in extremely\nlow-knowledge-assumption threat models. We show our attacks to be potent in\nwhitebox and blackbox settings, as well as when transferred across foundational\nmodel types (e.g., attack DINOv2 with CLIP)! This work is only just the\nbeginning of a long journey towards adversarially robust foundational vision\nmodels.\n","authors":["Nathan Inkawhich","Gwendolyn McDonald","Ryan Luley"],"pdf_url":"https://arxiv.org/pdf/2308.14597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14596v1","updated":"2023-08-28T14:08:42Z","published":"2023-08-28T14:08:42Z","title":"LatentDR: Improving Model Generalization Through Sample-Aware Latent\n Degradation and Restoration","summary":" Despite significant advances in deep learning, models often struggle to\ngeneralize well to new, unseen domains, especially when training data is\nlimited. To address this challenge, we propose a novel approach for\ndistribution-aware latent augmentation that leverages the relationships across\nsamples to guide the augmentation procedure. Our approach first degrades the\nsamples stochastically in the latent space, mapping them to augmented labels,\nand then restores the samples from their corrupted versions during training.\nThis process confuses the classifier in the degradation step and restores the\noverall class distribution of the original samples, promoting diverse\nintra-class/cross-domain variability. We extensively evaluate our approach on a\ndiverse set of datasets and tasks, including domain generalization benchmarks\nand medical imaging datasets with strong domain shift, where we show our\napproach achieves significant improvements over existing methods for latent\nspace augmentation. We further show that our method can be flexibly adapted to\nlong-tail recognition tasks, demonstrating its versatility in building more\ngeneralizable models. Code is available at\nhttps://github.com/nerdslab/LatentDR.\n","authors":["Ran Liu","Sahil Khose","Jingyun Xiao","Lakshmi Sathidevi","Keerthan Ramnath","Zsolt Kira","Eva L. 
Dyer"],"pdf_url":"https://arxiv.org/pdf/2308.14596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14595v1","updated":"2023-08-28T14:06:36Z","published":"2023-08-28T14:06:36Z","title":"Neural Network Training Strategy to Enhance Anomaly Detection\n Performance: A Perspective on Reconstruction Loss Amplification","summary":" Unsupervised anomaly detection (UAD) is a widely adopted approach in industry\ndue to rare anomaly occurrences and data imbalance. A desirable characteristic\nof an UAD model is contained generalization ability which excels in the\nreconstruction of seen normal patterns but struggles with unseen anomalies.\nRecent studies have pursued to contain the generalization capability of their\nUAD models in reconstruction from different perspectives, such as design of\nneural network (NN) structure and training strategy. In contrast, we note that\ncontaining of generalization ability in reconstruction can also be obtained\nsimply from steep-shaped loss landscape. Motivated by this, we propose a loss\nlandscape sharpening method by amplifying the reconstruction loss, dubbed Loss\nAMPlification (LAMP). LAMP deforms the loss landscape into a steep shape so the\nreconstruction error on unseen anomalies becomes greater. Accordingly, the\nanomaly detection performance is improved without any change of the NN\narchitecture. Our findings suggest that LAMP can be easily applied to any\nreconstruction error metrics in UAD settings where the reconstruction model is\ntrained with anomaly-free samples only.\n","authors":["YeongHyeon Park","Sungho Kang","Myung Jin Kim","Hyeonho Jeong","Hyunkyu Park","Hyeong Seok Kim","Juneho Yi"],"pdf_url":"https://arxiv.org/pdf/2308.14595v1.pdf","comment":"5 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.10842v2","updated":"2023-08-28T13:55:12Z","published":"2023-08-18T09:45:21Z","title":"Enhancing Agent Communication and Learning through Action and Language","summary":" We introduce a novel category of GC-agents capable of functioning as both\nteachers and learners. Leveraging action-based demonstrations and\nlanguage-based instructions, these agents enhance communication efficiency. We\ninvestigate the incorporation of pedagogy and pragmatism, essential elements in\nhuman communication and goal achievement, enhancing the agents' teaching and\nlearning capabilities. Furthermore, we explore the impact of combining\ncommunication modes (action and language) on learning outcomes, highlighting\nthe benefits of a multi-modal approach.\n","authors":["Hugo Caselles-Dupré","Olivier Sigaud","Mohamed Chetouani"],"pdf_url":"https://arxiv.org/pdf/2308.10842v2.pdf","comment":"IMOL workshop, Paris 2023"},{"id":"http://arxiv.org/abs/2308.14555v1","updated":"2023-08-28T13:17:39Z","published":"2023-08-28T13:17:39Z","title":"Kernel Limit of Recurrent Neural Networks Trained on Ergodic Data\n Sequences","summary":" Mathematical methods are developed to characterize the asymptotics of\nrecurrent neural networks (RNN) as the number of hidden units, data samples in\nthe sequence, hidden state updates, and training steps simultaneously grow to\ninfinity. In the case of an RNN with a simplified weight matrix, we prove the\nconvergence of the RNN to the solution of an infinite-dimensional ODE coupled\nwith the fixed point of a random algebraic equation. The analysis requires\naddressing several challenges which are unique to RNNs. 
In typical mean-field\napplications (e.g., feedforward neural networks), discrete updates are of\nmagnitude $\\mathcal{O}(\\frac{1}{N})$ and the number of updates is\n$\\mathcal{O}(N)$. Therefore, the system can be represented as an Euler\napproximation of an appropriate ODE/PDE, which it will converge to as $N\n\\rightarrow \\infty$. However, the RNN hidden layer updates are\n$\\mathcal{O}(1)$. Therefore, RNNs cannot be represented as a discretization of\nan ODE/PDE and standard mean-field techniques cannot be applied. Instead, we\ndevelop a fixed point analysis for the evolution of the RNN memory states, with\nconvergence estimates in terms of the number of update steps and the number of\nhidden units. The RNN hidden layer is studied as a function in a Sobolev space,\nwhose evolution is governed by the data sequence (a Markov chain), the\nparameter updates, and its dependence on the RNN hidden layer at the previous\ntime step. Due to the strong correlation between updates, a Poisson equation\nmust be used to bound the fluctuations of the RNN around its limit equation.\nThese mathematical methods give rise to the neural tangent kernel (NTK) limits\nfor RNNs trained on data sequences as the number of data samples and size of\nthe neural network grow to infinity.\n","authors":["Samuel Chun-Hei Lam","Justin Sirignano","Konstantinos Spiliopoulos"],"pdf_url":"https://arxiv.org/pdf/2308.14555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.05249v2","updated":"2023-08-28T13:00:38Z","published":"2022-05-11T03:36:04Z","title":"Secure & Private Federated Neuroimaging","summary":" The amount of biomedical data continues to grow rapidly. However, collecting\ndata from multiple sites for joint analysis remains challenging due to\nsecurity, privacy, and regulatory concerns. To overcome this challenge, we use\nFederated Learning, which enables distributed training of neural network models\nover multiple data sources without sharing data. Each site trains the neural\nnetwork over its private data for some time, then shares the neural network\nparameters (i.e., weights, gradients) with a Federation Controller, which in\nturn aggregates the local models, sends the resulting community model back to\neach site, and the process repeats. Our Federated Learning architecture,\nMetisFL, provides strong security and privacy. First, sample data never leaves\na site. Second, neural network parameters are encrypted before transmission and\nthe global neural model is computed under fully-homomorphic encryption.\nFinally, we use information-theoretic methods to limit information leakage from\nthe neural model to prevent a curious site from performing model inversion or\nmembership attacks. We present a thorough evaluation of the performance of\nsecure, private federated learning in neuroimaging tasks, including for\npredicting Alzheimer's disease and estimating BrainAGE from magnetic resonance\nimaging (MRI) studies, in challenging, heterogeneous federated environments\nwhere sites have different amounts of data and statistical distributions.\n","authors":["Dimitris Stripelis","Umang Gupta","Hamza Saleem","Nikhil Dhinagar","Tanmay Ghai","Rafael Chrysovalantis Anastasiou","Armaghan Asghar","Greg Ver Steeg","Srivatsan Ravi","Muhammad Naveed","Paul M. 
Thompson","Jose Luis Ambite"],"pdf_url":"https://arxiv.org/pdf/2205.05249v2.pdf","comment":"18 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.03854v3","updated":"2023-08-28T12:50:34Z","published":"2023-07-07T22:00:31Z","title":"inTformer: A Time-Embedded Attention-Based Transformer for Crash\n Likelihood Prediction at Intersections Using Connected Vehicle Data","summary":" The real-time crash likelihood prediction model is an essential component of\nthe proactive traffic safety management system. Over the years, numerous\nstudies have attempted to construct a crash likelihood prediction model in\norder to enhance traffic safety, but mostly on freeways. In the majority of the\nexisting studies, researchers have primarily employed a deep learning-based\nframework to identify crash potential. Lately, Transformer has emerged as a\npotential deep neural network that fundamentally operates through\nattention-based mechanisms. Transformer has several functional benefits over\nextant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can\nreadily handle long-term dependencies in a data sequence. Secondly,\nTransformers can parallelly process all elements in a data sequence during\ntraining. Finally, a Transformer does not have the vanishing gradient issue.\nRealizing the immense possibility of Transformers, this paper proposes\ninTersection-Transformer (inTformer), a time-embedded attention-based\nTransformer model that can effectively predict intersection crash likelihood in\nreal-time. The proposed model was evaluated using connected vehicle data\nextracted from Signal Analytics Platform. Acknowledging the complex traffic\noperation mechanism at intersection, this study developed zone-specific models\nby dividing the intersection region into two distinct zones:\nwithin-intersection and approach zone. The best inTformer models in\n'within-intersection,' and 'approach' zone achieved a sensitivity of 73%, and\n70%, respectively. The zone-level models were also compared to earlier studies\non crash likelihood prediction at intersections and with several established\ndeep learning models trained on the same connected vehicle dataset.\n","authors":["B M Tazbiul Hassan Anik","Zubayer Islam","Mohamed Abdel-Aty","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2307.03854v3.pdf","comment":"29 pages, 10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2308.14536v1","updated":"2023-08-28T12:47:41Z","published":"2023-08-28T12:47:41Z","title":"Spoken Language Intelligence of Large Language Models for Language\n Learning","summary":" People have long hoped for a conversational system that can assist in\nreal-life situations, and recent progress on large language models (LLMs) is\nbringing this idea closer to reality. While LLMs are often impressive in\nperformance, their efficacy in real-world scenarios that demand expert\nknowledge remains unclear. LLMs are believed to hold the most potential and\nvalue in education, especially in the development of Artificial intelligence\n(AI) based virtual teachers capable of facilitating language learning. Our\nfocus is centered on evaluating the efficacy of LLMs in the realm of education,\nspecifically in the areas of spoken language learning which encompass\nphonetics, phonology, and second language acquisition. We introduce a new\nmultiple-choice question dataset to evaluate the effectiveness of LLMs in the\naforementioned scenarios, including understanding and application of spoken\nlanguage knowledge. 
In addition, we investigate the influence of various\nprompting techniques such as zero- and few-shot methods (prepending the question\nwith question-answer exemplars), chain-of-thought (CoT, think step-by-step),\nin-domain exemplars and external tools (Google, Wikipedia). We conducted a\nlarge-scale evaluation on popular LLMs (20 distinct models) using these\nmethods. We achieved significant performance improvements compared to the\nzero-shot baseline on practical reasoning questions (GPT-3.5, 49.1% ->\n63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of different\nsizes have a good understanding of concepts in phonetics, phonology, and second\nlanguage acquisition, but show limitations in reasoning about real-world\nproblems. Additionally, we explore preliminary findings on conversational\ncommunication.\n","authors":["Linkai Peng","Baorian Nuchged","Yingming Gao"],"pdf_url":"https://arxiv.org/pdf/2308.14536v1.pdf","comment":"28 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2308.14522v1","updated":"2023-08-28T12:17:51Z","published":"2023-08-28T12:17:51Z","title":"Large Graph Models: A Perspective","summary":" Large models have emerged as the most recent groundbreaking achievements in\nartificial intelligence, particularly in machine learning. However, when it\ncomes to graphs, large models have not achieved the same level of success as in\nother fields, such as natural language processing and computer vision. To\npromote progress in applying large models to graphs, we present a perspective\npaper to discuss the challenges and opportunities associated with developing\nlarge graph models. First, we discuss the desired characteristics of large\ngraph models. Then, we present detailed discussions from three key\nperspectives: representation basis, graph data, and graph models. In each\ncategory, we provide a brief overview of recent advances and highlight the\nremaining challenges together with our visions. Finally, we discuss valuable\napplications of large graph models. We believe this perspective paper can\nencourage further investigations into large graph models, ultimately pushing\nus one step closer towards artificial general intelligence (AGI).\n","authors":["Ziwei Zhang","Haoyang Li","Zeyang Zhang","Yijian Qin","Xin Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14522v1.pdf","comment":"Preliminary version. Comments are welcome"},{"id":"http://arxiv.org/abs/2308.14521v1","updated":"2023-08-28T12:13:36Z","published":"2023-08-28T12:13:36Z","title":"Context-Aware Composition of Agent Policies by Markov Decision Process\n Entity Embeddings and Agent Ensembles","summary":" Computational agents support humans in many areas of life and are therefore\nfound in heterogeneous contexts. This means that agents operate in rapidly\nchanging environments and can be confronted with huge state and action spaces.\nIn order to perform services and carry out activities in a goal-oriented\nmanner, agents require prior knowledge and therefore have to develop and pursue\ncontext-dependent policies. The problem is that prescribing policies in advance\nis limited and inflexible, especially in dynamically changing environments.\nMoreover, the context of an agent determines its choice of actions.
Since the\nenvironments in which agents operate can be stochastic and complex in terms of\nthe number of states and feasible actions, activities are usually modelled in a\nsimplified way by Markov decision processes so that agents with reinforcement\nlearning are able to learn policies that help to capture the context and act\naccordingly to optimally perform activities. However, training policies for all\npossible contexts using reinforcement learning is time-consuming. A requirement\nand challenge for agents is to learn strategies quickly and respond immediately\nin cross-context environments and applications. In this work, we propose a\nnovel simulation-based approach that enables a) the representation of\nheterogeneous contexts through knowledge graphs and entity embeddings and b)\nthe context-aware composition of policies on demand by ensembles of agents\nrunning in parallel. The evaluation we performed on the \"Virtual Home\" dataset\nindicates that agents that need to seamlessly switch between different\ncontexts, can request on-the-fly composed policies that lead to the successful\ncompletion of context-appropriate activities without having to learn these\npolicies in lengthy training steps and episodes, in contrast to agents that\napply reinforcement learning.\n","authors":["Nicole Merkle","Ralf Mikut"],"pdf_url":"https://arxiv.org/pdf/2308.14521v1.pdf","comment":"29 pages, 11 figures, 9 tables, 3 listings, Submitted to Semantic Web\n Journal, Under revision for re-submission to Semantic Web Journal"},{"id":"http://arxiv.org/abs/2201.08110v2","updated":"2023-08-28T12:04:46Z","published":"2022-01-20T10:57:20Z","title":"NNP/MM: Accelerating molecular dynamics simulations with machine\n learning potentials and molecular mechanic","summary":" Machine learning potentials have emerged as a means to enhance the accuracy\nof biomolecular simulations. However, their application is constrained by the\nsignificant computational cost arising from the vast number of parameters\ncompared to traditional molecular mechanics. To tackle this issue, we introduce\nan optimized implementation of the hybrid method (NNP/MM), which combines\nneural network potentials (NNP) and molecular mechanics (MM). This approach\nmodels a portion of the system, such as a small molecule, using NNP while\nemploying MM for the remaining system to boost efficiency. By conducting\nmolecular dynamics (MD) simulations on various protein-ligand complexes and\nmetadynamics (MTD) simulations on a ligand, we showcase the capabilities of our\nimplementation of NNP/MM. It has enabled us to increase the simulation speed by\n5 times and achieve a combined sampling of one microsecond for each complex,\nmarking the longest simulations ever reported for this class of simulation.\n","authors":["Raimondas Galvelis","Alejandro Varela-Rial","Stefan Doerr","Roberto Fino","Peter Eastman","Thomas E. Markland","John D. Chodera","Gianni De Fabritiis"],"pdf_url":"https://arxiv.org/pdf/2201.08110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14516v1","updated":"2023-08-28T12:03:03Z","published":"2023-08-28T12:03:03Z","title":"Prediction of Tourism Flow with Sparse Geolocation Data","summary":" Modern tourism in the 21st century is facing numerous challenges. Among these\nthe rapidly growing number of tourists visiting space-limited regions like\nhistorical cities, museums and bottlenecks such as bridges is one of the\nbiggest. 
In this context, a proper and accurate prediction of tourism volume\nand tourism flow within a certain area is important and critical for visitor\nmanagement tasks such as sustainable treatment of the environment and\nprevention of overcrowding. Static flow control methods like conventional\nlow-level controllers or limiting access to overcrowded venues could not solve\nthe problem yet. In this paper, we empirically evaluate the performance of\nstate-of-the-art deep-learning methods such as RNNs, GNNs, and Transformers as\nwell as the classic statistical ARIMA method. Granular limited data supplied by\na tourism region is extended by exogenous data such as geolocation trajectories\nof individual tourists, weather and holidays. In the field of visitor flow\nprediction with sparse data, we are thereby capable of increasing the accuracy\nof our predictions, incorporating modern input feature handling as well as\nmapping geolocation data on top of discrete POI data.\n","authors":["Julian Lemmel","Zahra Babaiee","Marvin Kleinlehner","Ivan Majic","Philipp Neubauer","Johannes Scholz","Radu Grosu","Sophie A. Neubauer"],"pdf_url":"https://arxiv.org/pdf/2308.14516v1.pdf","comment":"Accepted for publication at the proceedings of the 5th International\n Data Science Conference - iDSC2023. arXiv admin note: substantial text\n overlap with arXiv:2206.13274"},{"id":"http://arxiv.org/abs/2308.14507v1","updated":"2023-08-28T11:49:23Z","published":"2023-08-28T11:49:23Z","title":"Spectral Estimators for Structured Generalized Linear Models via\n Approximate Message Passing","summary":" We consider the problem of parameter estimation from observations given by a\ngeneralized linear model. Spectral methods are a simple yet effective approach\nfor estimation: they estimate the parameter via the principal eigenvector of a\nmatrix obtained by suitably preprocessing the observations. Despite their wide\nuse, a rigorous performance characterization of spectral estimators, as well as\na principled way to preprocess the data, is available only for unstructured\n(i.e., i.i.d. Gaussian and Haar) designs. In contrast, real-world design\nmatrices are highly structured and exhibit non-trivial correlations. To address\nthis problem, we consider correlated Gaussian designs which capture the\nanisotropic nature of the measurements via a feature covariance matrix\n$\\Sigma$. Our main result is a precise asymptotic characterization of the\nperformance of spectral estimators in this setting. This then allows to\nidentify the optimal preprocessing that minimizes the number of samples needed\nto meaningfully estimate the parameter. Remarkably, such an optimal spectral\nestimator depends on $\\Sigma$ only through its normalized trace, which can be\nconsistently estimated from the data. Numerical results demonstrate the\nadvantage of our principled approach over previous heuristic methods.\n Existing analyses of spectral estimators crucially rely on the rotational\ninvariance of the design matrix. This key assumption does not hold for\ncorrelated Gaussian designs. To circumvent this difficulty, we develop a novel\nstrategy based on designing and analyzing an approximate message passing\nalgorithm whose fixed point coincides with the desired spectral estimator. 
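A minimal sketch of the spectral-estimation recipe just described: preprocess each observation, form a weighted covariance of the design rows, and take its principal eigenvector by power iteration. The bounded preprocessing and the i.i.d. Gaussian design below are generic illustrative choices, not the optimal preprocessing or the correlated designs analyzed in the paper.

# Sketch of a spectral estimator for a generalized linear model:
# D = (1/n) * sum_i T(y_i) x_i x_i^T, estimate = top eigenvector of D.
import numpy as np

rng = np.random.default_rng(0)
n, d = 2000, 50
beta = rng.normal(size=d); beta /= np.linalg.norm(beta)
X = rng.normal(size=(n, d))                      # i.i.d. design, for illustration
y = (X @ beta) ** 2 + 0.1 * rng.normal(size=n)   # phase-retrieval-like GLM

T = np.clip(y, 0.0, 3.0)                         # bounded preprocessing (generic)
D = (X * T[:, None]).T @ X / n

v = rng.normal(size=d)
for _ in range(200):                             # power iteration
    v = D @ v
    v /= np.linalg.norm(v)

print("overlap |<v, beta>| =", abs(v @ beta))    # close to 1 when estimation works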
Our\nmethodology is general, and opens the way to the precise characterization of\nspiked matrices and of the corresponding spectral methods in a variety of\nsettings.\n","authors":["Yihan Zhang","Hong Chang Ji","Ramji Venkataramanan","Marco Mondelli"],"pdf_url":"https://arxiv.org/pdf/2308.14507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14486v1","updated":"2023-08-28T10:59:05Z","published":"2023-08-28T10:59:05Z","title":"Rebalancing Social Feed to Minimize Polarization and Disagreement","summary":" Social media have great potential for enabling public discourse on important\nsocietal issues. However, adverse effects, such as polarization and echo\nchambers, greatly impact the benefits of social media and call for algorithms\nthat mitigate these effects. In this paper, we propose a novel problem\nformulation aimed at slightly nudging users' social feeds in order to strike a\nbalance between relevance and diversity, thus mitigating the emergence of\npolarization, without lowering the quality of the feed. Our approach is based\non re-weighting the relative importance of the accounts that a user follows, so\nas to calibrate the frequency with which the content produced by various\naccounts is shown to the user. We analyze the convexity properties of the\nproblem, demonstrating the non-matrix convexity of the objective function and\nthe convexity of the feasible set. To efficiently address the problem, we\ndevelop a scalable algorithm based on projected gradient descent. We also prove\nthat our problem statement is a proper generalization of the undirected-case\nproblem so that our method can also be adopted for undirected social networks.\nAs a baseline for comparison in the undirected case, we develop a semidefinite\nprogramming approach, which provides the optimal solution. Through extensive\nexperiments on synthetic and real-world datasets, we validate the effectiveness\nof our approach, which outperforms non-trivial baselines, underscoring its\nability to foster healthier and more cohesive online communities.\n","authors":["Federico Cinus","Aristides Gionis","Francesco Bonchi"],"pdf_url":"https://arxiv.org/pdf/2308.14486v1.pdf","comment":"Accepted for publication at ACM CIKM 2023"},{"id":"http://arxiv.org/abs/2303.10058v2","updated":"2023-08-28T10:46:22Z","published":"2023-03-17T15:38:39Z","title":"No Fear of Classifier Biases: Neural Collapse Inspired Federated\n Learning with Synthetic and Fixed Classifier","summary":" Data heterogeneity is an inherent challenge that hinders the performance of\nfederated learning (FL). Recent studies have identified the biased classifiers\nof local models as the key bottleneck. Previous attempts have used classifier\ncalibration after FL training, but this approach falls short in improving the\npoor feature representations caused by training-time classifier biases.\nResolving the classifier bias dilemma in FL requires a full understanding of\nthe mechanisms behind the classifier. Recent advances in neural collapse have\nshown that the classifiers and feature prototypes under perfect training\nscenarios collapse into an optimal structure called simplex equiangular tight\nframe (ETF). Building on this neural collapse insight, we propose a solution to\nthe FL's classifier bias problem by utilizing a synthetic and fixed ETF\nclassifier during training. The optimal classifier structure enables all\nclients to learn unified and optimal feature representations even under\nextremely heterogeneous data. 
We devise several effective modules to better\nadapt the ETF structure in FL, achieving both high generalization and\npersonalization. Extensive experiments demonstrate that our method achieves\nstate-of-the-art performances on CIFAR-10, CIFAR-100, and Tiny-ImageNet.\n","authors":["Zexi Li","Xinyi Shang","Rui He","Tao Lin","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2303.10058v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14481v1","updated":"2023-08-28T10:43:53Z","published":"2023-08-28T10:43:53Z","title":"Group Regression for Query Based Object Detection and Tracking","summary":" Group regression is commonly used in 3D object detection to predict box\nparameters of similar classes in a joint head, aiming to benefit from\nsimilarities while separating highly dissimilar classes. For query-based\nperception methods, this has, so far, not been feasible. We close this gap and\npresent a method to incorporate multi-class group regression, especially\ndesigned for the 3D domain in the context of autonomous driving, into existing\nattention and query-based perception approaches. We enhance a transformer based\njoint object detection and tracking model with this approach, and thoroughly\nevaluate its behavior and performance. For group regression, the classes of the\nnuScenes dataset are divided into six groups of similar shape and prevalence,\neach being regressed by a dedicated head. We show that the proposed method is\napplicable to many existing transformer based perception approaches and can\nbring potential benefits. The behavior of query group regression is thoroughly\nanalyzed in comparison to a unified regression head, e.g. in terms of\nclass-switching behavior and distribution of the output parameters. The\nproposed method offers many possibilities for further research, such as in the\ndirection of deep multi-hypotheses tracking.\n","authors":["Felicia Ruppel","Florian Faion","Claudius Gläser","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2308.14481v1.pdf","comment":"Accepted for publication at the 2023 26th IEEE International\n Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28,\n 2023, in Bilbao, Spain"},{"id":"http://arxiv.org/abs/2308.14478v1","updated":"2023-08-28T10:35:04Z","published":"2023-08-28T10:35:04Z","title":"Some issues in robust clustering","summary":" Some key issues in robust clustering are discussed with focus on Gaussian\nmixture model based clustering, namely the formal definition of outliers,\nambiguity between groups of outliers and clusters, the interaction between\nrobust clustering and the estimation of the number of clusters, the essential\ndependence of (not only) robust clustering on tuning decisions, and\nshortcomings of existing measurements of cluster stability when it comes to\noutliers.\n","authors":["Christian Hennig"],"pdf_url":"https://arxiv.org/pdf/2308.14478v1.pdf","comment":"11 pages, no figures"},{"id":"http://arxiv.org/abs/2212.07524v3","updated":"2023-08-28T10:06:41Z","published":"2022-12-14T22:12:32Z","title":"Invariant Lipschitz Bandits: A Side Observation Approach","summary":" Symmetry arises in many optimization and decision-making problems, and has\nattracted considerable attention from the optimization community: By utilizing\nthe existence of such symmetries, the process of searching for optimal\nsolutions can be improved significantly. 
Despite its success in (offline)\noptimization, the utilization of symmetries has not been well examined within\nthe online optimization settings, especially in the bandit literature. As such,\nin this paper we study the invariant Lipschitz bandit setting, a subclass of\nthe Lipschitz bandits where the reward function and the set of arms are\npreserved under a group of transformations. We introduce an algorithm named\n\\texttt{UniformMesh-N}, which naturally integrates side observations using\ngroup orbits into the \\texttt{UniformMesh} algorithm\n(\\cite{Kleinberg2005_UniformMesh}), which uniformly discretizes the set of\narms. Using the side-observation approach, we prove an improved regret upper\nbound, which depends on the cardinality of the group, given that the group is\nfinite. We also prove a matching regret's lower bound for the invariant\nLipschitz bandit class (up to logarithmic factors). We hope that our work will\nignite further investigation of symmetry in bandit theory and sequential\ndecision-making theory in general.\n","authors":["Nam Phuong Tran","Long Tran-Thanh"],"pdf_url":"https://arxiv.org/pdf/2212.07524v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14456v1","updated":"2023-08-28T09:49:48Z","published":"2023-08-28T09:49:48Z","title":"Speech Self-Supervised Representations Benchmarking: a Case for Larger\n Probing Heads","summary":" Self-supervised learning (SSL) leverages large datasets of unlabeled speech\nto reach impressive performance with reduced amounts of annotated data. The\nhigh number of proposed approaches fostered the emergence of comprehensive\nbenchmarks that evaluate their performance on a set of downstream tasks\nexploring various aspects of the speech signal. However, while the number of\nconsidered tasks has been growing, most proposals rely upon a single downstream\narchitecture that maps the frozen SSL representations to the task labels. This\nstudy examines how benchmarking results are affected by changes in the probing\nhead architecture. Interestingly, we found that altering the downstream\narchitecture structure leads to significant fluctuations in the performance\nranking of the evaluated models. Against common practices in speech SSL\nbenchmarking, we evaluate larger-capacity probing heads, showing their impact\non performance, inference costs, generalization and multi-level feature\nexploitation.\n","authors":["Salah Zaiem","Youcef Kemiche","Titouan Parcollet","Slim Essid","Mirco Ravanelli"],"pdf_url":"https://arxiv.org/pdf/2308.14456v1.pdf","comment":"11 Pages"},{"id":"http://arxiv.org/abs/2304.14824v2","updated":"2023-08-28T09:14:34Z","published":"2023-04-28T13:06:14Z","title":"A noise-robust acoustic method for recognizing foraging activities of\n grazing cattle","summary":" Farmers must continuously improve their livestock production systems to\nremain competitive in the growing dairy market. Precision livestock farming\ntechnologies provide individualized monitoring of animals on commercial farms,\noptimizing livestock production. Continuous acoustic monitoring is a widely\naccepted sensing technique used to estimate the daily rumination and grazing\ntime budget of free-ranging cattle. However, typical environmental and natural\nnoises on pastures noticeably affect the performance limiting the practical\napplication of current acoustic methods. In this study, we present the\noperating principle and generalization capability of an acoustic method called\nNoise-Robust Foraging Activity Recognizer (NRFAR). 
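Returning to the probing-head study above, the comparison it describes can be sketched in a few lines: the same frozen upstream features are mapped to labels once by a linear probe and once by a larger MLP probe, so that only downstream capacity varies. Features and task below are synthetic stand-ins for frozen SSL representations.

# Linear probe vs. larger MLP probe on identical frozen features.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)
feats = rng.normal(size=(1000, 128))                   # stand-in frozen features
labels = (feats[:, 0] * feats[:, 1] > 0).astype(int)   # XOR-like, nonlinear task

tr, te = slice(0, 800), slice(800, None)
linear = LogisticRegression(max_iter=2000).fit(feats[tr], labels[tr])
mlp = MLPClassifier(hidden_layer_sizes=(256, 256), max_iter=500,
                    random_state=0).fit(feats[tr], labels[tr])

print("linear probe accuracy:", linear.score(feats[te], labels[te]))
print("MLP probe accuracy:   ", mlp.score(feats[te], labels[te]))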
The proposed method\ndetermines foraging activity bouts by analyzing fixed-length segments of\nidentified jaw movement events produced during grazing and rumination. The\nadditive noise robustness of the NRFAR was evaluated for several\nsignal-to-noise ratios using stationary Gaussian white noise and four different\nnonstationary natural noise sources. In noiseless conditions, NRFAR reached an\naverage balanced accuracy of 86.4%, outperforming two previous acoustic methods\nby more than 7.5%. Furthermore, NRFAR performed better than previous acoustic\nmethods in 77 of 80 evaluated noisy scenarios (53 cases with p<0.05). NRFAR has\nbeen shown to be effective in harsh free-ranging environments and could be used\nas a reliable solution to improve pasture management and monitor the health and\nwelfare of dairy cows. The instrumentation and computational algorithms\npresented in this publication are protected by a pending patent application: AR\nP20220100910. Web demo available at: https://sinc.unl.edu.ar/web-demo/nrfar\n","authors":["Luciano S. Martinez-Rau","José O. Chelotti","Mariano Ferrero","Julio R. Galli","Santiago A. Utsumi","Alejandra M. Planisich","H. Leonardo Rufiner","Leonardo L. Giovanini"],"pdf_url":"https://arxiv.org/pdf/2304.14824v2.pdf","comment":"list of used audio-clips is available in the list_audio_clips.xlsx"},{"id":"http://arxiv.org/abs/2308.14430v1","updated":"2023-08-28T09:06:32Z","published":"2023-08-28T09:06:32Z","title":"TextrolSpeech: A Text Style Control Speech Corpus With Codec Language\n Text-to-Speech Models","summary":" Recently, there has been a growing interest in the field of controllable\nText-to-Speech (TTS). While previous studies have relied on users providing\nspecific style factor values based on acoustic knowledge or selecting reference\nspeeches that meet certain requirements, generating speech solely from natural\ntext prompts has emerged as a new challenge for researchers. This challenge\narises due to the scarcity of high-quality speech datasets with natural text\nstyle prompt and the absence of advanced text-controllable TTS models. In light\nof this, 1) we propose TextrolSpeech, which is the first large-scale speech\nemotion dataset annotated with rich text attributes. The dataset comprises\n236,220 pairs of style prompt in natural text descriptions with five style\nfactors and corresponding speech samples. Through iterative experimentation, we\nintroduce a multi-stage prompt programming approach that effectively utilizes\nthe GPT model for generating natural style descriptions in large volumes. 2)\nFurthermore, to address the need for generating audio with greater style\ndiversity, we propose an efficient architecture called Salle. This architecture\ntreats text controllable TTS as a language model task, utilizing audio codec\ncodes as an intermediate representation to replace the conventional\nmel-spectrogram. Finally, we successfully demonstrate the ability of the\nproposed model by showing a comparable performance in the controllable TTS\ntask. 
Audio samples are available at https://sall-e.github.io/\n","authors":["Shengpeng Ji","Jialong Zuo","Minghui Fang","Ziyue Jiang","Feiyang Chen","Xinyu Duan","Baoxing Huai","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14424v1","updated":"2023-08-28T09:04:52Z","published":"2023-08-28T09:04:52Z","title":"Shielded Reinforcement Learning for Hybrid Systems","summary":" Safe and optimal controller synthesis for switched-controlled hybrid systems,\nwhich combine differential equations and discrete changes of the system's\nstate, is known to be intricately hard. Reinforcement learning has been\nleveraged to construct near-optimal controllers, but their behavior is not\nguaranteed to be safe, even when it is encouraged by reward engineering. One\nway of imposing safety to a learned controller is to use a shield, which is\ncorrect by design. However, obtaining a shield for non-linear and hybrid\nenvironments is itself intractable. In this paper, we propose the construction\nof a shield using the so-called barbaric method, where an approximate finite\nrepresentation of an underlying partition-based two-player safety game is\nextracted via systematically picked samples of the true transition function.\nWhile hard safety guarantees are out of reach, we experimentally demonstrate\nstrong statistical safety guarantees with a prototype implementation and UPPAAL\nSTRATEGO. Furthermore, we study the impact of the synthesized shield when\napplied as either a pre-shield (applied before learning a controller) or a\npost-shield (only applied after learning a controller). We experimentally\ndemonstrate superiority of the pre-shielding approach. We apply our technique\non a range of case studies, including two industrial examples, and further\nstudy post-optimization of the post-shielding approach.\n","authors":["Asger Horn Brorholt","Peter Gjøl Jensen","Kim Guldstrand Larsen","Florian Lorber","Christian Schilling"],"pdf_url":"https://arxiv.org/pdf/2308.14424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11796v2","updated":"2023-08-28T09:00:50Z","published":"2022-10-21T08:19:45Z","title":"Differentiable Constrained Imitation Learning for Robot Motion Planning\n and Control","summary":" Motion planning and control are crucial components of robotics applications\nlike automated driving. Here, spatio-temporal hard constraints like system\ndynamics and safety boundaries (e.g., obstacles) restrict the robot's motions.\nDirect methods from optimal control solve a constrained optimization problem.\nHowever, in many applications finding a proper cost function is inherently\ndifficult because of the weighting of partially conflicting objectives. On the\nother hand, Imitation Learning (IL) methods such as Behavior Cloning (BC)\nprovide an intuitive framework for learning decision-making from offline\ndemonstrations and constitute a promising avenue for planning and control in\ncomplex robot applications. Prior work primarily relied on soft constraint\napproaches, which use additional auxiliary loss terms describing the\nconstraints. However, catastrophic safety-critical failures might occur in\nout-of-distribution (OOD) scenarios. This work integrates the flexibility of IL\nwith hard constraint handling in optimal control. Our approach constitutes a\ngeneral framework for constraint robotic motion planning and control, as well\nas traffic agent simulation, whereas we focus on mobile robot and automated\ndriving applications. 
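To make the shielding idea above concrete, here is a toy post-shield wrapper: the learned controller proposes an action, and the shield vetoes it in favor of a fallback whenever sampled successor states leave the safe region, echoing the sample-based "barbaric" flavor rather than the paper's game-based construction. The dynamics, safe set, and fallback action are illustrative assumptions.

# Toy post-shield: override the proposed action if sampled successors are unsafe.
import random

def dynamics(x, a):
    return x + a + random.uniform(-0.05, 0.05)    # 1-D stochastic system (toy)

def is_safe(x):
    return -1.0 <= x <= 1.0                       # assumed safe region

def shielded_action(x, proposed, fallback=-0.1, samples=20):
    # Sample the transition repeatedly; veto the action on any unsafe successor.
    if all(is_safe(dynamics(x, proposed)) for _ in range(samples)):
        return proposed
    return fallback

random.seed(0)
x = 0.9
learned_action = 0.3                               # would likely exit [-1, 1]
print("executed action:", shielded_action(x, learned_action))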
Hard constraints are integrated into the learning problem\nin a differentiable manner, via explicit completion and gradient-based\ncorrection. Simulated experiments of mobile robot navigation and automated\ndriving provide evidence for the performance of the proposed method.\n","authors":["Christopher Diehl","Janis Adamek","Martin Krüger","Frank Hoffmann","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2210.11796v2.pdf","comment":"International Conference on Intelligent Robots and Systems Agents4AD\n Workshop, IROS 2023"},{"id":"http://arxiv.org/abs/2210.13533v2","updated":"2023-08-28T08:58:18Z","published":"2022-10-24T18:34:24Z","title":"Sufficient Invariant Learning for Distribution Shift","summary":" Machine learning algorithms have shown remarkable performance in diverse\napplications. However, it is still challenging to guarantee performance in\ndistribution shifts when distributions of training and test datasets are\ndifferent. There have been several approaches to improve the performance in\ndistribution shift cases by learning invariant features across groups or\ndomains. However, we observe that the previous works only learn invariant\nfeatures partially. While the prior works focus on the limited invariant\nfeatures, we first raise the importance of the sufficient invariant features.\nSince only training sets are given empirically, the learned partial invariant\nfeatures from training sets might not be present in the test sets under\ndistribution shift. Therefore, the performance improvement on distribution\nshifts might be limited. In this paper, we argue that learning sufficient\ninvariant features from the training set is crucial for the distribution shift\ncase. Concretely, we newly observe the connection between a) sufficient\ninvariant features and b) flatness differences between groups or domains.\nMoreover, we propose a new algorithm, Adaptive Sharpness-aware Group\nDistributionally Robust Optimization (ASGDRO), to learn sufficient invariant\nfeatures across domains or groups. ASGDRO learns sufficient invariant features\nby seeking common flat minima across all groups or domains. Therefore, ASGDRO\nimproves the performance on diverse distribution shift cases. Besides, we\nprovide a new simple dataset, Heterogeneous-CMNIST, to diagnose whether the\nvarious algorithms learn sufficient invariant features.\n","authors":["Taero Kim","Sungjun Lim","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2210.13533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12016v2","updated":"2023-08-28T08:51:56Z","published":"2023-08-23T09:18:41Z","title":"MKL-$L_{0/1}$-SVM","summary":" This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework\nfor the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some\nKKT-like first-order optimality conditions are provided and then exploited to\ndevelop a fast ADMM algorithm to solve the nonsmooth nonconvex optimization\nproblem. Numerical experiments on synthetic and real datasets show that the\nperformance of our MKL-$L_{0/1}$-SVM is comparable with the one of the leading\napproaches called SimpleMKL developed by Rakotomamonjy, Bach, Canu, and\nGrandvalet [Journal of Machine Learning Research, vol.~9, pp.~2491--2521,\n2008].\n","authors":["Bin Zhu","Yijie Shi"],"pdf_url":"https://arxiv.org/pdf/2308.12016v2.pdf","comment":"26 pages in the JMLR template, 4 figures, and 2 tables. 
arXiv admin\n note: substantial text overlap with arXiv:2303.04445"},{"id":"http://arxiv.org/abs/2308.14412v1","updated":"2023-08-28T08:50:12Z","published":"2023-08-28T08:50:12Z","title":"Task-Aware Machine Unlearning and Its Application in Load Forecasting","summary":" Data privacy and security have become a non-negligible factor in load\nforecasting. Previous research has mainly focused on enhancing the training\nstage. However, once the model is trained and deployed, it may need to `forget'\n(i.e., remove the impact of) part of the training data if the data is found to\nbe malicious or as requested by the data owner. This paper introduces a machine\nunlearning algorithm that is specifically designed to remove the influence of\npart of the original dataset on an already trained forecaster. However, direct\nunlearning inevitably degrades the model's generalization ability. To balance\nunlearning completeness against performance degradation, a performance-aware\nalgorithm is proposed that evaluates the sensitivity of local model parameter\nchanges using influence functions and sample re-weighting.\nMoreover, we observe that statistical criteria cannot fully reflect the\noperation cost of downstream tasks. Therefore, a task-aware machine unlearning\nmethod is proposed whose objective is a tri-level optimization with dispatch and\nredispatch problems considered. We theoretically prove the existence of the\ngradient of this objective, which is key to re-weighting the remaining samples.\nWe test the unlearning algorithms on linear and neural network load forecasters\nwith a realistic load dataset. The simulations demonstrate the balance between\nunlearning completeness and operational cost. All code can be found at\nhttps://github.com/xuwkk/task_aware_machine_unlearning.\n","authors":["Wangkun Xu","Fei Teng"],"pdf_url":"https://arxiv.org/pdf/2308.14412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14409v1","updated":"2023-08-28T08:47:06Z","published":"2023-08-28T08:47:06Z","title":"Steerable Conditional Diffusion for Out-of-Distribution Adaptation in\n Imaging Inverse Problems","summary":" Denoising diffusion models have emerged as the go-to framework for solving\ninverse problems in imaging. A critical concern regarding these models is their\nperformance on out-of-distribution (OOD) tasks, which remains an under-explored\nchallenge. Realistic reconstructions inconsistent with the measured data can be\ngenerated, hallucinating image features that are uniquely present in the\ntraining dataset. To simultaneously enforce data-consistency and leverage\ndata-driven priors, we introduce a novel sampling framework called Steerable\nConditional Diffusion. This framework adapts the denoising network specifically\nto the available measured data.
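A generic sketch of the data-consistency mechanism such samplers rely on: after each denoising step, the iterate is nudged down the gradient of ||Ax - y||^2 toward agreement with the measurements. The denoiser below is a shrinkage stub standing in for a trained prior, and the paper's distinctive step of adapting the denoiser itself is not reproduced.

# Data-consistency guidance in an iterative sampler (stubbed prior).
import numpy as np

rng = np.random.default_rng(0)
d, m = 64, 32
A = rng.normal(size=(m, d)) / np.sqrt(m)        # forward (measurement) operator
x_true = np.zeros(d); x_true[:8] = 1.0
y = A @ x_true                                  # noiseless measurements

def denoise(x):
    return 0.9 * x                              # stub prior: shrink toward zero

x = rng.normal(size=d)
print("initial residual:", np.linalg.norm(A @ x - y).round(3))
for _ in range(300):
    x = denoise(x)
    x -= 0.1 * A.T @ (A @ x - y)                # gradient step on ||Ax - y||^2
print("final residual:  ", np.linalg.norm(A @ x - y).round(3))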
Utilising our proposed method, we achieve\nsubstantial enhancements in OOD performance across diverse imaging modalities,\nadvancing the robust deployment of denoising diffusion models in real-world\napplications.\n","authors":["Riccardo Barbano","Alexander Denker","Hyungjin Chung","Tae Hoon Roh","Simon Arrdige","Peter Maass","Bangti Jin","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2308.14409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14407v1","updated":"2023-08-28T08:42:06Z","published":"2023-08-28T08:42:06Z","title":"Identifying topology of leaky photonic lattices with machine learning","summary":" We show how machine learning techniques can be applied for the classification\nof topological phases in leaky photonic lattices using limited measurement\ndata. We propose an approach based solely on bulk intensity measurements, thus\nexempt from the need for complicated phase retrieval procedures. In particular,\nwe design a fully connected neural network that accurately determines\ntopological properties from the output intensity distribution in dimerized\nwaveguide arrays with leaky channels, after propagation of a spatially\nlocalized initial excitation at a finite distance, in a setting that closely\nemulates realistic experimental conditions.\n","authors":["Ekaterina O. Smolina","Lev A. Smirnov","Daniel Leykam","Franco Nori","Daria A. Smirnova"],"pdf_url":"https://arxiv.org/pdf/2308.14407v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14400v1","updated":"2023-08-28T08:33:45Z","published":"2023-08-28T08:33:45Z","title":"Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer\n and NearFarMix Augmentation","summary":" In computer vision, depth estimation is crucial for domains like robotics,\nautonomous vehicles, augmented reality, and virtual reality. Integrating\nsemantics with depth enhances scene understanding through reciprocal\ninformation sharing. However, the scarcity of semantic information in datasets\nposes challenges. Existing convolutional approaches with limited local\nreceptive fields hinder the full utilization of the symbiotic potential between\ndepth and semantics. This paper introduces a dataset-invariant semi-supervised\nstrategy to address the scarcity of semantic information. It proposes the Depth\nSemantics Symbiosis module, leveraging the Symbiotic Transformer for achieving\ncomprehensive mutual awareness by information exchange within both local and\nglobal contexts. Additionally, a novel augmentation, NearFarMix is introduced\nto combat overfitting and compensate both depth-semantic tasks by strategically\nmerging regions from two images, generating diverse and structurally consistent\nsamples with enhanced control. Extensive experiments on NYU-Depth-V2 and KITTI\ndatasets demonstrate the superiority of our proposed techniques in indoor and\noutdoor environments.\n","authors":["Md Awsafur Rahman","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2308.14400v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2202.06599v3","updated":"2023-08-28T08:27:30Z","published":"2022-02-14T10:40:51Z","title":"Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in\n First Trimester 3D Ultrasound","summary":" Segmentation and spatial alignment of ultrasound (US) imaging data acquired\nin the in first trimester are crucial for monitoring human embryonic growth and\ndevelopment throughout this crucial period of life. 
Current approaches are\neither manual or semi-automatic and are therefore very time-consuming and prone\nto errors. To automate these tasks, we propose a multi-atlas framework for\nautomatic segmentation and spatial alignment of the embryo using deep learning\nwith minimal supervision. Our framework learns to register the embryo to an\natlas, which consists of the US images acquired at a range of gestational age\n(GA), segmented and spatially aligned to a predefined standard orientation.\nFrom this, we can derive the segmentation of the embryo and put the embryo in\nstandard orientation. US images acquired at 8+0 till 12+6 weeks GA were used\nand eight subjects were selected as atlas. We evaluated different fusion\nstrategies to incorporate multiple atlases: 1) training the framework using\natlas images from a single subject, 2) training the framework with data of all\navailable atlases and 3) ensembling of the frameworks trained per subject. To\nevaluate the performance, we calculated the Dice score over the test set. We\nfound that training the framework using all available atlases outperformed\nensembling and gave similar results compared to the best of all frameworks\ntrained on a single subject. Furthermore, we found that selecting images from\nthe four atlases closest in GA out of all available atlases, regardless of the\nindividual quality, gave the best results with a median Dice score of 0.72. We\nconclude that our framework can accurately segment and spatially align the\nembryo in first trimester 3D US images and is robust for the variation in\nquality that existed in the available atlases.\n","authors":["W. A. P. Bastiaansen","M. Rousian","R. P. M. Steegers-Theunissen","W. J. Niessen","A. H. J. Koning","S. Klein"],"pdf_url":"https://arxiv.org/pdf/2202.06599v3.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html"},{"id":"http://arxiv.org/abs/2308.13269v2","updated":"2023-08-28T08:09:52Z","published":"2023-08-25T09:42:54Z","title":"Heterogeneous Decentralized Machine Unlearning with Seed Model\n Distillation","summary":" As some recent information security legislation endowed users with\nunconditional rights to be forgotten by any trained machine learning model,\npersonalized IoT service providers have to put unlearning functionality into\ntheir consideration. The most straightforward method to unlearn users'\ncontribution is to retrain the model from the initial state, which is not\nrealistic in high throughput applications with frequent unlearning requests.\nThough some machine unlearning frameworks have been proposed to speed up the\nretraining process, they fail to match decentralized learning scenarios. In\nthis paper, we design a decentralized unlearning framework called HDUS, which\nuses distilled seed models to construct erasable ensembles for all clients.\nMoreover, the framework is compatible with heterogeneous on-device models,\nrepresenting stronger scalability in real-world applications. 
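The erasable-ensemble idea just introduced can be sketched structurally: each client contributes one distilled seed model, predictions are averaged across members, and unlearning a client reduces to dropping its member with no retraining. The per-client models below are linear stubs, not distilled networks.

# Structural sketch of an erasable ensemble of per-client seed models.
import numpy as np

rng = np.random.default_rng(0)

class ErasableEnsemble:
    def __init__(self):
        self.members = {}                        # client_id -> model weights

    def add(self, client_id, weights):
        self.members[client_id] = weights

    def unlearn(self, client_id):
        self.members.pop(client_id, None)        # erasure = membership removal

    def predict(self, x):
        return np.mean([w @ x for w in self.members.values()], axis=0)

ens = ErasableEnsemble()
for cid in range(5):
    ens.add(cid, rng.normal(size=8))             # one "distilled" model per client

x = rng.normal(size=8)
print("before unlearning:", ens.predict(x))
ens.unlearn(3)                                   # client 3 asks to be forgotten
print("after unlearning: ", ens.predict(x))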
Extensive\nexperiments on three real-world datasets show that our HDUS achieves\nstate-of-the-art performance.\n","authors":["Guanhua Ye","Tong Chen","Quoc Viet Hung Nguyen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2308.13269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14388v1","updated":"2023-08-28T08:07:57Z","published":"2023-08-28T08:07:57Z","title":"Biclustering Methods via Sparse Penalty","summary":" In this paper, we first reviewed several biclustering methods that are used\nto identify the most significant clusters in gene expression data. Here we\nmainly focused on the SSVD(sparse SVD) method and tried a new sparse penalty\nnamed \"Prenet penalty\" which has been used only in factor analysis to gain\nsparsity. Then in the simulation study, we tried different types of generated\ndatasets (with different sparsity and dimension) and tried 1-layer\napproximation then for k-layers which shows the mixed Prenet penalty is very\neffective for non-overlapped data. Finally, we used some real gene expression\ndata to show the behavior of our methods.\n","authors":["Jiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04391v5","updated":"2023-08-28T08:02:47Z","published":"2023-02-09T01:09:57Z","title":"The Re-Label Method For Data-Centric Machine Learning","summary":" In industry deep learning application, our manually labeled data has a\ncertain number of noisy data. To solve this problem and achieve more than 90\nscore in dev dataset, we present a simple method to find the noisy data and\nre-label the noisy data by human, given the model predictions as references in\nhuman labeling. In this paper, we illustrate our idea for a broad set of deep\nlearning tasks, includes classification, sequence tagging, object detection,\nsequence generation, click-through rate prediction. The experimental results\nand human evaluation results verify our idea.\n","authors":["Tong Guo"],"pdf_url":"https://arxiv.org/pdf/2302.04391v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14380v1","updated":"2023-08-28T07:55:01Z","published":"2023-08-28T07:55:01Z","title":"Self-Supervision for Tackling Unsupervised Anomaly Detection: Pitfalls\n and Opportunities","summary":" Self-supervised learning (SSL) is a growing torrent that has recently\ntransformed machine learning and its many real world applications, by learning\non massive amounts of unlabeled data via self-generated supervisory signals.\nUnsupervised anomaly detection (AD) has also capitalized on SSL, by\nself-generating pseudo-anomalies through various data augmentation functions or\nexternal data exposure. In this vision paper, we first underline the importance\nof the choice of SSL strategies on AD performance, by presenting evidences and\nstudies from the AD literature. Equipped with the understanding that SSL incurs\nvarious hyperparameters (HPs) to carefully tune, we present recent developments\non unsupervised model selection and augmentation tuning for SSL-based AD. 
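One common SSL-for-AD recipe of the kind discussed above can be sketched directly: create pseudo-anomalies from normal data with an augmentation function, train a classifier to separate the two, and use the predicted pseudo-anomaly probability as the anomaly score. The feature-shuffling augmentation here is one illustrative choice among many, and its strength is exactly the kind of hyperparameter the paper argues must be tuned.

# Pseudo-anomaly generation via augmentation, scored by a classifier.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
normal = rng.normal(size=(1000, 10))

def augment(x):
    x = x.copy()
    cols = rng.choice(x.shape[1], size=3, replace=False)
    x[:, cols] = rng.permutation(x[:, cols], axis=0)   # break feature correlations
    return x

pseudo = augment(normal)
X = np.vstack([normal, pseudo])
y = np.r_[np.zeros(len(normal)), np.ones(len(pseudo))]
clf = RandomForestClassifier(random_state=0).fit(X, y)

test = rng.normal(size=(5, 10))
test[0] += 6.0                                          # a shifted test point
print("anomaly scores:", clf.predict_proba(test)[:, 1].round(3))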
We\nthen highlight emerging challenges and future opportunities; on designing new\npretext tasks and augmentation functions for different data modalities,\ncreating novel model selection solutions for systematically tuning the SSL HPs,\nas well as on capitalizing on the potential of pretrained foundation models on\nAD through effective density estimation.\n","authors":["Leman Akoglu","Jaemin Yoo"],"pdf_url":"https://arxiv.org/pdf/2308.14380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14377v1","updated":"2023-08-28T07:49:30Z","published":"2023-08-28T07:49:30Z","title":"Meta Attentive Graph Convolutional Recurrent Network for Traffic\n Forecasting","summary":" Traffic forecasting is a fundamental problem in intelligent transportation\nsystems. Existing traffic predictors are limited by their expressive power to\nmodel the complex spatial-temporal dependencies in traffic data, mainly due to\nthe following limitations. Firstly, most approaches are primarily designed to\nmodel the local shared patterns, which makes them insufficient to capture the\nspecific patterns associated with each node globally. Hence, they fail to learn\neach node's unique properties and diversified patterns. Secondly, most existing\napproaches struggle to accurately model both short- and long-term dependencies\nsimultaneously. In this paper, we propose a novel traffic predictor, named Meta\nAttentive Graph Convolutional Recurrent Network (MAGCRN). MAGCRN utilizes a\nGraph Convolutional Recurrent Network (GCRN) as a core module to model local\ndependencies and improves its operation with two novel modules: 1) a\nNode-Specific Meta Pattern Learning (NMPL) module to capture node-specific\npatterns globally and 2) a Node Attention Weight Generation Module (NAWG)\nmodule to capture short- and long-term dependencies by connecting the\nnode-specific features with the ones learned initially at each time step during\nGCRN operation. Experiments on six real-world traffic datasets demonstrate that\nNMPL and NAWG together enable MAGCRN to outperform state-of-the-art baselines\non both short- and long-term predictions.\n","authors":["Adnan Zeb","Yongchao Ye","Shiyao Zhang","James J. Q. Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14376v1","updated":"2023-08-28T07:49:01Z","published":"2023-08-28T07:49:01Z","title":"Are Existing Out-Of-Distribution Techniques Suitable for Network\n Intrusion Detection?","summary":" Machine learning (ML) has become increasingly popular in network intrusion\ndetection. However, ML-based solutions always respond regardless of whether the\ninput data reflects known patterns, a common issue across safety-critical\napplications. While several proposals exist for detecting Out-Of-Distribution\n(OOD) in other fields, it remains unclear whether these approaches can\neffectively identify new forms of intrusions for network security. New attacks,\nnot necessarily affecting overall distributions, are not guaranteed to be\nclearly OOD as instead, images depicting new classes are in computer vision. In\nthis work, we investigate whether existing OOD detectors from other fields\nallow the identification of unknown malicious traffic. We also explore whether\nmore discriminative and semantically richer embedding spaces within models,\nsuch as those created with contrastive learning and multi-class tasks, benefit\ndetection. Our investigation covers a set of six OOD techniques that employ\ndifferent detection strategies. 
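As a reference point for what such detectors do, here is one of the simplest strategies of the kind benchmarked in this line of work, maximum softmax probability (MSP): flag inputs whose top softmax score falls below a threshold calibrated on known traffic. The classifier and traffic features are synthetic stand-ins.

# MSP-style OOD detection on a stand-in traffic classifier.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
means = np.array([[3, 0, 0, 0], [0, 3, 0, 0], [0, 0, 3, 0]], dtype=float)
X = np.vstack([rng.normal(size=(200, 4)) + m for m in means])   # 3 known classes
y = np.repeat([0, 1, 2], 200)
clf = LogisticRegression(max_iter=1000).fit(X, y)

def msp(batch):
    return clf.predict_proba(batch).max(axis=1)   # confidence over known classes

tau = np.quantile(msp(X), 0.05)                    # allow ~5% FPR on known data
unknown = rng.normal(size=(200, 4)) + [0, 0, 0, 5]  # unseen "attack" class
print("OOD flag rate on unknown traffic:", (msp(unknown) < tau).mean())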
These techniques are applied to models trained\nin various ways and subsequently exposed to unknown malicious traffic from the\nsame and different datasets (network environments). Our findings suggest that\nexisting detectors can identify a consistent portion of new malicious traffic,\nand that improved embedding spaces enhance detection. We also demonstrate that\nsimple combinations of certain detectors can identify almost 100% of malicious\ntraffic in our tested scenarios.\n","authors":["Andrea Corsini","Shanchieh Jay Yang"],"pdf_url":"https://arxiv.org/pdf/2308.14376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14374v1","updated":"2023-08-28T07:42:26Z","published":"2023-08-28T07:42:26Z","title":"Online Continual Learning on Hierarchical Label Expansion","summary":" Continual learning (CL) enables models to adapt to new tasks and environments\nwithout forgetting previously learned knowledge. While current CL setups have\nignored the relationship between labels in the past task and the new task with\nor without small task overlaps, real-world scenarios often involve hierarchical\nrelationships between old and new tasks, posing another challenge for\ntraditional CL approaches. To address this challenge, we propose a novel\nmulti-level hierarchical class incremental task configuration with an online\nlearning constraint, called hierarchical label expansion (HLE). Our\nconfiguration allows a network to first learn coarse-grained classes, with data\nlabels continually expanding to more fine-grained classes in various hierarchy\ndepths. To tackle this new setup, we propose a rehearsal-based method that\nutilizes hierarchy-aware pseudo-labeling to incorporate hierarchical class\ninformation. Additionally, we propose a simple yet effective memory management\nand sampling strategy that selectively adopts samples of newly encountered\nclasses. Our experiments demonstrate that our proposed method can effectively\nuse hierarchy on our HLE setup to improve classification accuracy across all\nlevels of hierarchies, regardless of depth and class imbalance ratio,\noutperforming prior state-of-the-art works by significant margins while also\noutperforming them on the conventional disjoint, blurry and i-Blurry CL setups.\n","authors":["Byung Hyun Lee","Okchul Jung","Jonghyun Choi","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2308.14374v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2102.00877v2","updated":"2023-08-28T07:41:55Z","published":"2021-02-01T14:36:34Z","title":"A probabilistic Taylor expansion with Gaussian processes","summary":" We study a class of Gaussian processes for which the posterior mean, for a\nparticular choice of data, replicates a truncated Taylor expansion of any\norder. The data consist of derivative evaluations at the expansion point and\nthe prior covariance kernel belongs to the class of Taylor kernels, which can\nbe written in a certain power series form. We discuss and prove some results on\nmaximum likelihood estimation of parameters of Taylor kernels. 
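The Taylor-kernel result admits a short worked example for one concrete kernel, k(x, x') = exp(x x'), whose power series sum_n (x x')^n / n! puts it in the Taylor class. With data given by derivative evaluations f^(j)(0), the Gram matrix is diag(j!) and the cross-covariances are x^j, so the posterior mean collapses to the truncated Taylor expansion; the check below uses f = exp, where every derivative at 0 equals 1.

# GP posterior mean with Taylor kernel exp(x x') = truncated Taylor expansion.
import math
import numpy as np

N = 6                                              # number of derivatives used
f_derivs = np.ones(N)                              # f = exp: f^(j)(0) = 1
K = np.diag([math.factorial(j) for j in range(N)]) # Gram matrix of derivative data
weights = np.linalg.solve(K, f_derivs)             # = f^(j)(0) / j!

def posterior_mean(x):
    return sum(w * x**j for j, w in enumerate(weights))

x = 0.7
taylor = sum(x**j / math.factorial(j) for j in range(N))
print(posterior_mean(x), taylor, math.exp(x))      # first two agree exactly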
The proposed\nframework is a special case of Gaussian process regression based on data that\nis orthogonal in the reproducing kernel Hilbert space of the covariance kernel.\n","authors":["Toni Karvonen","Jon Cockayne","Filip Tronarp","Simo Särkkä"],"pdf_url":"https://arxiv.org/pdf/2102.00877v2.pdf","comment":"To appear in Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2210.00637v4","updated":"2023-08-28T07:41:52Z","published":"2022-10-02T21:36:27Z","title":"Benign Autoencoders","summary":" Recent progress in Generative Artificial Intelligence (AI) relies on\nefficient data representations, often featuring encoder-decoder architectures.\nWe formalize the mathematical problem of finding the optimal encoder-decoder\npair and characterize its solution, which we name the \"benign autoencoder\"\n(BAE). We prove that BAE projects data onto a manifold whose dimension is the\noptimal compressibility dimension of the generative problem. We highlight\nsurprising connections between BAE and several recent developments in AI, such\nas conditional GANs, context encoders, stable diffusion, stacked autoencoders,\nand the learning capabilities of generative models. As an illustration, we show\nhow BAE can find optimal, low-dimensional latent representations that improve\nthe performance of a discriminator under a distribution shift. By compressing\n\"malignant\" data dimensions, BAE leads to smoother and more stable gradients.\n","authors":["Semyon Malamud","Teng Andrea Xu","Antoine Didisheim"],"pdf_url":"https://arxiv.org/pdf/2210.00637v4.pdf","comment":"This paper replaces and subsumes arXiv:2110.08884"},{"id":"http://arxiv.org/abs/2308.09729v3","updated":"2023-08-28T07:37:36Z","published":"2023-08-17T16:59:50Z","title":"MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large\n Language Models","summary":" LLMs usually exhibit limitations in their ability to incorporate new\nknowledge, the generation of hallucinations, and the transparency of their\ndecision-making process. In this paper, we explore how to prompt LLMs with\nknowledge graphs (KG), working as a remedy to engage LLMs with up-to-date\nknowledge and elicit the reasoning pathways from LLMs. Specifically, we build a\nprompting pipeline that endows LLMs with the capability of comprehending KG\ninputs and inferring with a combined implicit knowledge and the retrieved\nexternal knowledge. In addition, we investigate eliciting the mind map on which\nLLMs perform the reasoning and generate the answers. It is identified that the\nproduced mind map exhibits the reasoning pathways of LLMs grounded on the\nontology of knowledge, hence bringing the prospects of probing and gauging LLM\ninference in production. The experiments on three question & answering datasets\nalso show that MindMap prompting leads to a striking empirical gain. For\ninstance, prompting a GPT-3.5 with MindMap yields an overwhelming performance\nover GPT-4 consistently. 
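A minimal sketch of the knowledge-graph prompting pattern described above: retrieved triples are verbalized into an evidence block, and the model is asked to ground both its answer and its reasoning path in them. The triples and prompt wording are invented for illustration; the paper's pipeline and prompts differ.

# Verbalizing KG triples into an evidence-grounded prompt.
triples = [
    ("metformin", "treats", "type 2 diabetes"),
    ("metformin", "contraindicated_with", "severe renal impairment"),
    ("type 2 diabetes", "symptom", "increased thirst"),
]

def verbalize(ts):
    return "\n".join(f"- {h} --{r}--> {t}" for h, r, t in ts)

def kg_prompt(question, ts):
    return ("Knowledge graph evidence:\n" + verbalize(ts) + "\n\n"
            "Question: " + question + "\n"
            "Answer using only the evidence above, and list the evidence path\n"
            "(a mind map of triples) that supports your answer.\n")

print(kg_prompt("Can a patient with severe renal impairment take metformin?",
                triples))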
We also demonstrate that with structured facts\nretrieved from KG, MindMap can outperform a series of\nprompting-with-document-retrieval methods, benefiting from more accurate,\nconcise, and comprehensive knowledge from KGs.\n","authors":["Yilin Wen","Zifeng Wang","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09729v3.pdf","comment":"7 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2209.14013v3","updated":"2023-08-28T07:32:23Z","published":"2022-09-28T11:41:38Z","title":"On the Robustness of Random Forest Against Untargeted Data Poisoning: An\n Ensemble-Based Approach","summary":" Machine learning is becoming ubiquitous. From finance to medicine, machine\nlearning models are boosting decision-making processes and even outperforming\nhumans in some tasks. This huge progress in terms of prediction quality does\nnot however find a counterpart in the security of such models and corresponding\npredictions, where perturbations of fractions of the training set (poisoning)\ncan seriously undermine the model accuracy. Research on poisoning attacks and\ndefenses received increasing attention in the last decade, leading to several\npromising solutions aiming to increase the robustness of machine learning.\nAmong them, ensemble-based defenses, where different models are trained on\nportions of the training set and their predictions are then aggregated, provide\nstrong theoretical guarantees at the price of a linear overhead. Surprisingly,\nensemble-based defenses, which do not pose any restrictions on the base model,\nhave not been applied to increase the robustness of random forest models. The\nwork in this paper aims to fill in this gap by designing and implementing a\nnovel hash-based ensemble approach that protects random forest against\nuntargeted, random poisoning attacks. An extensive experimental evaluation\nmeasures the performance of our approach against a variety of attacks, as well\nas its sustainability in terms of resource consumption and performance, and\ncompares it with a traditional monolithic model based on random forest. A final\ndiscussion presents our main findings and compares our approach with existing\npoisoning defenses targeting random forests.\n","authors":["Marco Anisetti","Claudio A. Ardagna","Alessandro Balestrucci","Nicola Bena","Ernesto Damiani","Chan Yeob Yeun"],"pdf_url":"https://arxiv.org/pdf/2209.14013v3.pdf","comment":"Accepted in IEEE Transactions on Sustainable Computing; 15 pages, 8\n figures"},{"id":"http://arxiv.org/abs/2302.02092v3","updated":"2023-08-28T07:25:10Z","published":"2023-02-04T04:52:22Z","title":"Interpolation for Robust Learning: Data Augmentation on Wasserstein\n Geodesics","summary":" We propose to study and promote the robustness of a model as per its\nperformance through the interpolation of training data distributions.\nSpecifically, (1) we augment the data by finding the worst-case Wasserstein\nbarycenter on the geodesic connecting subpopulation distributions of different\ncategories. (2) We regularize the model for smoother performance on the\ncontinuous geodesic path connecting subpopulation distributions. (3)\nAdditionally, we provide a theoretical guarantee of robustness improvement and\ninvestigate how the geodesic location and the sample size contribute,\nrespectively. 
Experimental validations of the proposed strategy on\n\\textit{four} datasets, including CIFAR-100 and ImageNet, establish the\nefficacy of our method, e.g., our method improves the baselines' certifiable\nrobustness on CIFAR10 up to $7.7\\%$, with $16.8\\%$ on empirical robustness on\nCIFAR-100. Our work provides a new perspective of model robustness through the\nlens of Wasserstein geodesic-based interpolation with a practical off-the-shelf\nstrategy that can be combined with existing robust training methods.\n","authors":["Jiacheng Zhu","Jielin Qiu","Aritra Guha","Zhuolin Yang","Xuanlong Nguyen","Bo Li","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2302.02092v3.pdf","comment":"34 pages, 3 figures, 18 tables"},{"id":"http://arxiv.org/abs/2308.14364v1","updated":"2023-08-28T07:23:03Z","published":"2023-08-28T07:23:03Z","title":"Target-independent XLA optimization using Reinforcement Learning","summary":" An important challenge in Machine Learning compilers like XLA is multi-pass\noptimization and analysis. There has been recent interest chiefly in XLA\ntarget-dependent optimization on the graph-level, subgraph-level, and\nkernel-level phases. We specifically focus on target-independent optimization\nXLA HLO pass ordering: our approach aims at finding the optimal sequence of\ncompiler optimization passes, which is decoupled from target-dependent\noptimization. However, there is little domain specific study in pass ordering\nfor XLA HLO. To this end, we propose introducing deep Reinforcement Learning\n(RL) based search for optimal XLA HLO pass ordering. We also propose\nenhancements to the deep RL algorithms to further improve optimal search\nperformance and open the research direction for domain-specific guidance for\nRL. We create an XLA Gym experimentation framework as a tool to enable RL\nalgorithms to interact with the compiler for passing optimizations and thereby\ntrain agents. Overall, in our experimentation we observe an average of $13.3\\%$\nimprovement in operation count reduction on a benchmark of GPT-2 training\ngraphs and $10.4\\%$ improvement on a diverse benchmark including GPT-2, BERT,\nand ResNet graphs using the proposed approach over the compiler's default phase\nordering.\n","authors":["Milan Ganai","Haichen Li","Theodore Enns","Yida Wang","Randy Huang"],"pdf_url":"https://arxiv.org/pdf/2308.14364v1.pdf","comment":"Workshop on ML for Systems @ NeurIPS 2022"},{"id":"http://arxiv.org/abs/2308.14355v1","updated":"2023-08-28T07:03:08Z","published":"2023-08-28T07:03:08Z","title":"Can Transformer and GNN Help Each Other?","summary":" Although Transformer has achieved great success in natural language process\nand computer vision, it has difficulty generalizing to medium and large-scale\ngraph data for two important reasons: (i) High complexity. (ii) Failing to\ncapture the complex and entangled structure information. In graph\nrepresentation learning, Graph Neural Networks(GNNs) can fuse the graph\nstructure and node attributes but have limited receptive fields. Therefore, we\nquestion whether can we combine Transformers and GNNs to help each other. In\nthis paper, we propose a new model named TransGNN where the Transformer layer\nand GNN layer are used alternately to improve each other. Specifically, to\nexpand the receptive field and disentangle the information aggregation from\nedges, we propose using Transformer to aggregate more relevant nodes'\ninformation to improve the message passing of GNNs. 
Besides, to capture the\ngraph structure information, we utilize positional encoding and make use of the\nGNN layer to fuse the structure into node attributes, which improves the\nTransformer in graph data. We also propose to sample the most relevant nodes\nfor Transformer and two efficient sample update strategies to lower the\ncomplexity. At last, we theoretically prove that TransGNN is more expressive\nthan GNNs with only extra linear complexity. The experiments on eight datasets\ncorroborate the effectiveness of TransGNN on node and graph classification\ntasks.\n","authors":["Peiyan Zhang","Yuchen Yan","Chaozhuo Li","Senzhang Wang","Xing Xie","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14352v1","updated":"2023-08-28T06:56:08Z","published":"2023-08-28T06:56:08Z","title":"EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models","summary":" Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a\nrevolution in machine intelligence, owing to their exceptional capabilities in\na wide range of machine learning tasks. However, the transition of LLMs from\ndata centers to edge devices presents a set of challenges and opportunities.\nWhile this shift can enhance privacy and availability, it is hampered by the\nenormous parameter sizes of these models, leading to impractical runtime costs.\nIn light of these considerations, we introduce EdgeMoE, the first on-device\ninference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant\nof sparse LLMs that exhibit nearly constant computational complexity as their\nparameter size scales. EdgeMoE achieves both memory and computational\nefficiency by strategically partitioning the model across the storage\nhierarchy. Specifically, non-expert weights are stored in the device's memory,\nwhile expert weights are kept in external storage and are fetched into memory\nonly when they are activated. This design is underpinned by a crucial insight\nthat expert weights, though voluminous, are infrequently accessed due to sparse\nactivation patterns. To further mitigate the overhead associated with expert\nI/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise\nbitwidth adaptation: This method reduces the size of expert weights with an\nacceptable level of accuracy loss. (2) Expert management: It predicts the\nexperts that will be activated in advance and preloads them into the\ncompute-I/O pipeline, thus further optimizing the process. In empirical\nevaluations conducted on well-established MoE LLMs and various edge devices,\nEdgeMoE demonstrates substantial memory savings and performance improvements\nwhen compared to competitive baseline solutions.\n","authors":["Rongjie Yi","Liwei Guo","Shiyun Wei","Ao Zhou","Shangguang Wang","Mengwei Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12191v2","updated":"2023-08-28T06:54:12Z","published":"2022-12-23T08:20:37Z","title":"Deep Unfolding-based Weighted Averaging for Federated Learning in\n Heterogeneous Environments","summary":" Federated learning is a collaborative model training method that iterates\nmodel updates by multiple clients and aggregation of the updates by a central\nserver. Device and statistical heterogeneity of participating clients cause\nsignificant performance degradation so that an appropriate aggregation weight\nshould be assigned to each client in the aggregation phase of the server. 
To\nadjust the aggregation weights, this paper employs deep unfolding, which is\na parameter tuning method that leverages both the learning capability of deep\nlearning on training data and domain knowledge. This enables us to\ndirectly incorporate the heterogeneity of the environment of interest into the\ntuning of the aggregation weights. The proposed approach can be combined with\nvarious federated learning algorithms. The results of numerical experiments\nindicate that a higher test accuracy for unknown class-balanced data can be\nobtained with the proposed method than that with conventional heuristic\nweighting methods. The proposed method can handle large-scale learning models\nwith the aid of pretrained models such that it can perform practical real-world\ntasks. The convergence rate of federated learning algorithms with the proposed\nmethod is also provided in this paper.\n","authors":["Ayano Nakai-Kasai","Tadashi Wadayama"],"pdf_url":"https://arxiv.org/pdf/2212.12191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14350v1","updated":"2023-08-28T06:53:31Z","published":"2023-08-28T06:53:31Z","title":"Simple Modification of the Upper Confidence Bound Algorithm by\n Generalized Weighted Averages","summary":" The multi-armed bandit (MAB) problem is a classical problem that models\nsequential decision-making under uncertainty in reinforcement learning. In this\nstudy, we propose a new generalized upper confidence bound (UCB) algorithm\n(GWA-UCB1) by extending UCB1, which is a representative algorithm for MAB\nproblems, using generalized weighted averages, and present an effective\nalgorithm for various problem settings. GWA-UCB1 is a two-parameter\ngeneralization of the balance between exploration and exploitation in UCB1 and\ncan be implemented with a simple modification of the UCB1 formula. Therefore,\nthis algorithm can be easily applied to UCB-based reinforcement learning\nmodels. In preliminary experiments, we investigated the optimal parameters of a\nsimple generalized UCB1 (G-UCB1), prepared for comparison, and of GWA-UCB1 in a\nstochastic MAB problem with two arms. Subsequently, we confirmed the\nperformance of the algorithms with the investigated parameters on stochastic\nMAB problems when arm reward probabilities were sampled from uniform or normal\ndistributions and on survival MAB problems assuming more realistic situations.\nGWA-UCB1 outperformed G-UCB1, UCB1-Tuned, and Thompson sampling in most problem\nsettings and can be useful in many situations. The code is available at\nhttps://github.com/manome/python-mab.\n","authors":["Nobuhito Manome","Shuji Shinohara","Ung-il Chung"],"pdf_url":"https://arxiv.org/pdf/2308.14350v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14348v1","updated":"2023-08-28T06:48:06Z","published":"2023-08-28T06:48:06Z","title":"Label-free Deep Learning Driven Secure Access Selection in\n Space-Air-Ground Integrated Networks","summary":" In Space-air-ground integrated networks (SAGIN), the inherent openness and\nextensive broadcast coverage expose these networks to significant eavesdropping\nthreats. The inherent co-channel interference due to spectrum\nsharing among multi-tier access networks in SAGIN can be leveraged to\nassist the physical layer security among heterogeneous transmissions. However,\nit is challenging to conduct a secrecy-oriented access strategy due to both\nheterogeneous resources and different eavesdropping models. 
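The GWA-UCB1 entry above describes a two-parameter generalization of UCB1 via generalized weighted averages. A minimal sketch of that idea follows; the weighted power mean used to combine the exploitation and exploration terms is an illustrative assumption, not the paper's exact formula (which is available with the linked code). At w = 0.5, p = 1 the ranking reduces to plain UCB1.

import math
import random

def gwa_index(mean_reward, pulls, t, w=0.5, p=1.0):
    # Weighted power mean of the exploitation term (empirical mean) and the
    # UCB1 exploration bonus. This (w, p) combination is an illustrative guess
    # at "generalized weighted average", not the published GWA-UCB1 formula.
    bonus = math.sqrt(2.0 * math.log(t) / pulls)
    if p == 0:                                      # geometric mean as p -> 0
        return (mean_reward ** (1.0 - w)) * (bonus ** w)
    return ((1.0 - w) * mean_reward ** p + w * bonus ** p) ** (1.0 / p)

def run(probs, horizon=10000, w=0.5, p=1.0, seed=0):
    rng = random.Random(seed)
    pulls, totals = [0] * len(probs), [0.0] * len(probs)
    for t in range(1, horizon + 1):
        if t <= len(probs):
            arm = t - 1                             # play every arm once first
        else:
            arm = max(range(len(probs)),
                      key=lambda i: gwa_index(totals[i] / pulls[i], pulls[i], t, w, p))
        reward = 1.0 if rng.random() < probs[arm] else 0.0
        pulls[arm] += 1
        totals[arm] += reward
    return pulls

print(run([0.4, 0.6]))                              # the 0.6 arm should dominate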
In this paper, we\nexplore secure access selection for a scenario involving multi-mode users\ncapable of accessing satellites, unmanned aerial vehicles, or base stations in\nthe presence of eavesdroppers. Particularly, we propose a Q-network\napproximation based deep learning approach for selecting the optimal access\nstrategy for maximizing the sum secrecy rate. Meanwhile, the power optimization\nis also carried out by an unsupervised learning approach to improve the secrecy\nperformance. Remarkably, two neural networks are trained by unsupervised\nlearning and Q-network approximation which are both label-free methods without\nknowing the optimal solution as labels. Numerical results verify the efficiency\nof our proposed power optimization approach and access strategy, leading to\nenhanced secure transmission performance.\n","authors":["Zhaowei Wang","Zhisheng Yin","Xiucheng Wang","Nan Cheng","Yuan Zhang","Tom H. Luan"],"pdf_url":"https://arxiv.org/pdf/2308.14348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14343v1","updated":"2023-08-28T06:40:02Z","published":"2023-08-28T06:40:02Z","title":"Buy when? Survival machine learning model comparison for purchase timing","summary":" The value of raw data is unlocked by converting it into information and\nknowledge that drives decision-making. Machine Learning (ML) algorithms are\ncapable of analysing large datasets and making accurate predictions. Market\nsegmentation, client lifetime value, and marketing techniques have all made use\nof machine learning. This article examines marketing machine learning\ntechniques such as Support Vector Machines, Genetic Algorithms, Deep Learning,\nand K-Means. ML is used to analyse consumer behaviour, propose items, and inform\nother customer choices about whether or not to purchase a product or service,\nbut it is seldom used to predict when a person will buy a product or a basket\nof products. In this paper, the survival models Kernel SVM, DeepSurv, Survival\nRandom Forest, and MTLR are examined to predict individual time-to-purchase\ndecisions. Gender, Income, Location, PurchaseHistory, OnlineBehavior,\nInterests, PromotionsDiscounts and CustomerExperience all have an influence on\npurchasing time, according to the analysis. The study shows that the DeepSurv\nmodel predicted purchase completion the best. These insights assist marketers\nin increasing conversion rates.\n","authors":["Diego Vallarino"],"pdf_url":"https://arxiv.org/pdf/2308.14343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14340v1","updated":"2023-08-28T06:32:09Z","published":"2023-08-28T06:32:09Z","title":"HRGCN: Heterogeneous Graph-level Anomaly Detection with Hierarchical\n Relation-augmented Graph Neural Networks","summary":" This work considers the problem of heterogeneous graph-level anomaly\ndetection. Heterogeneous graphs are commonly used to represent behaviours\nbetween different types of entities in complex industrial systems for capturing\nas much information about the system operations as possible. Detecting\nanomalous heterogeneous graphs from a large set of system behaviour graphs is\ncrucial for many real-world applications like online web/mobile service and\ncloud access control. To address the problem, we propose HRGCN, an unsupervised\ndeep heterogeneous graph neural network, to model complex heterogeneous\nrelations between different entities in the system for effectively identifying\nthese anomalous behaviour graphs. 
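For the purchase-timing entry above: the paper compares Kernel SVM, DeepSurv, Survival Random Forest and MTLR, whose implementations are spread across several libraries. As a minimal stand-in, a Cox proportional hazards model from the lifelines package on synthetic, right-censored purchase data illustrates the survival-analysis setup; all column names and dynamics below are invented for the example.

import numpy as np
import pandas as pd
from lifelines import CoxPHFitter

rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({
    "income": rng.normal(50.0, 10.0, n),            # invented covariates
    "promotion": rng.integers(0, 2, n),
})
days = rng.exponential(30.0, n) * np.where(df["promotion"] == 1, 0.6, 1.0)
df["days_to_purchase"] = np.minimum(days, 60.0)     # observe at most 60 days
df["purchased"] = (days <= 60.0).astype(int)        # 0 = right-censored

model = CoxPHFitter()
model.fit(df, duration_col="days_to_purchase", event_col="purchased")
model.print_summary()                               # hazard ratios per covariate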
HRGCN trains a hierarchical\nrelation-augmented Heterogeneous Graph Neural Network (HetGNN), which learns\nbetter graph representations by modelling the interactions among all the system\nentities and considering both source-to-destination entity (node) types and\ntheir relation (edge) types. Extensive evaluation on two real-world application\ndatasets shows that HRGCN outperforms state-of-the-art competing anomaly\ndetection approaches. We further present a real-world industrial case study to\njustify the effectiveness of HRGCN in detecting anomalous (e.g., congested)\nnetwork devices in a mobile communication service. HRGCN is available at\nhttps://github.com/jiaxililearn/HRGCN.\n","authors":["Jiaxi Li","Guansong Pang","Ling Chen","Mohammad-Reza Namazi-Rad"],"pdf_url":"https://arxiv.org/pdf/2308.14340v1.pdf","comment":"12 pages, 10 figures, 6 tables. Accepted"},{"id":"http://arxiv.org/abs/2308.14338v1","updated":"2023-08-28T06:31:37Z","published":"2023-08-28T06:31:37Z","title":"Fair Few-shot Learning with Auxiliary Sets","summary":" Recently, there has been a growing interest in developing machine learning\n(ML) models that can promote fairness, i.e., eliminating biased predictions\ntowards certain populations (e.g., individuals from a specific demographic\ngroup). Most existing works learn such models based on well-designed fairness\nconstraints in optimization. Nevertheless, in many practical ML tasks, only\nvery few labeled data samples can be collected, which can lead to inferior\nfairness performance. This is because existing fairness constraints are\ndesigned to restrict the prediction disparity among different sensitive groups,\nbut with few samples, it becomes difficult to accurately measure the disparity,\nthus rendering ineffective fairness optimization. In this paper, we define the\nfairness-aware learning task with limited training samples as the \\emph{fair\nfew-shot learning} problem. To deal with this problem, we devise a novel\nframework that accumulates fairness-aware knowledge across different\nmeta-training tasks and then generalizes the learned knowledge to meta-test\ntasks. To compensate for insufficient training samples, we propose an essential\nstrategy to select and leverage an auxiliary set for each meta-test task. These\nauxiliary sets contain several labeled training samples that can enhance the\nmodel performance regarding fairness in meta-test tasks, thereby allowing for\nthe transfer of learned useful fairness-oriented knowledge to meta-test tasks.\nFurthermore, we conduct extensive experiments on three real-world datasets to\nvalidate the superiority of our framework against the state-of-the-art\nbaselines.\n","authors":["Song Wang","Jing Ma","Lu Cheng","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2308.14338v1.pdf","comment":"ECAI 2023"},{"id":"http://arxiv.org/abs/2102.03895v5","updated":"2023-08-28T06:26:04Z","published":"2021-02-07T19:29:28Z","title":"Functional optimal transport: map estimation and domain adaptation for\n functional data","summary":" We introduce a formulation of optimal transport problem for distributions on\nfunction spaces, where the stochastic map between functional domains can be\npartially represented in terms of an (infinite-dimensional) Hilbert-Schmidt\noperator mapping a Hilbert space of functions to another. For numerous machine\nlearning tasks, data can be naturally viewed as samples drawn from spaces of\nfunctions, such as curves and surfaces, in high dimensions. 
Optimal transport\nfor functional data analysis provides a useful framework for treating such\ndomains. Since probability measures in infinite dimensional spaces generally\nlack absolute continuity (that is, with respect to non-degenerate Gaussian\nmeasures), the Monge map in the standard optimal transport theory for finite\ndimensional spaces may not exist. Our approach to the optimal transport problem\nin infinite dimensions is by a suitable regularization technique -- we restrict\nthe class of transport maps to be a Hilbert-Schmidt space of operators. To\nthis end, we develop an efficient algorithm for finding the stochastic\ntransport map between functional domains and provide theoretical guarantees on\nthe existence, uniqueness, and consistency of our estimate for the\nHilbert-Schmidt operator. We validate our method on synthetic datasets and\nexamine the functional properties of the transport map. Experiments on\nreal-world datasets of robot arm trajectories further demonstrate the\neffectiveness of our method on applications in domain adaptation.\n","authors":["Jiacheng Zhu","Aritra Guha","Dat Do","Mengdi Xu","XuanLong Nguyen","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2102.03895v5.pdf","comment":"48 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.14333v1","updated":"2023-08-28T06:22:43Z","published":"2023-08-28T06:22:43Z","title":"DiffSmooth: Certifiably Robust Learning via Diffusion Models and Local\n Smoothing","summary":" Diffusion models have been leveraged to perform adversarial purification and\nthus provide both empirical and certified robustness for a standard model. On\nthe other hand, different robustly trained smoothed models have been studied to\nimprove the certified robustness. Thus, it raises a natural question: Can\ndiffusion models be used to achieve improved certified robustness on those\nrobustly trained smoothed models? In this work, we first theoretically show\nthat instances recovered by diffusion models are in the bounded neighborhood of\nthe original instance with high probability; and the \"one-shot\" denoising\ndiffusion probabilistic models (DDPM) can approximate the mean of the generated\ndistribution of a continuous-time diffusion model, which approximates the\noriginal instance under mild conditions. Inspired by our analysis, we propose a\ncertifiably robust pipeline DiffSmooth, which first performs adversarial\npurification via diffusion models and then maps the purified instances to a\ncommon region via a simple yet effective local smoothing strategy. We conduct\nextensive experiments on different datasets and show that DiffSmooth achieves\nSOTA-certified robustness compared with eight baselines. For instance,\nDiffSmooth improves the SOTA-certified accuracy from $36.0\%$ to $53.0\%$ under\n$\ell_2$ radius $1.5$ on ImageNet. The code is available at\n[https://github.com/javyduck/DiffSmooth].\n","authors":["Jiawei Zhang","Zhongzhu Chen","Huan Zhang","Chaowei Xiao","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2308.14333v1.pdf","comment":"Accepted in 32nd USENIX Security, 2023"},{"id":"http://arxiv.org/abs/2308.14328v1","updated":"2023-08-28T06:15:14Z","published":"2023-08-28T06:15:14Z","title":"Reinforcement Learning for Generative AI: A Survey","summary":" Deep Generative AI has been a long-standing essential topic in the machine\nlearning community, which can impact a number of application areas like text\ngeneration and computer vision. 
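DiffSmooth's second stage above is a local smoothing step. The sketch below shows a generic majority-vote smoothing prediction over Gaussian perturbations, with the diffusion-based purifier stubbed out as an identity function; it illustrates the mechanics under those assumptions, not the authors' pipeline.

import numpy as np

def smoothed_predict(classify, purify, x, sigma=0.25, n=1000, seed=0):
    # Majority vote over Gaussian perturbations of a purified input: the
    # local-smoothing step. `classify` maps a batch of inputs to integer
    # labels; `purify` stands in for the diffusion purifier (identity here).
    rng = np.random.default_rng(seed)
    x = purify(x)
    noisy = x[None, :] + rng.normal(0.0, sigma, size=(n,) + x.shape)
    votes = np.bincount(classify(noisy))
    return votes.argmax(), votes.max() / n          # label and its vote share

label_fn = lambda batch: (batch[:, 0] > 0).astype(int)   # toy linear classifier
print(smoothed_predict(label_fn, lambda z: z, np.array([0.3, -1.0])))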
The major paradigm to train a generative model\nis maximum likelihood estimation, which pushes the learner to capture and\napproximate the target data distribution by decreasing the divergence between\nthe model distribution and the target distribution. This formulation\nsuccessfully establishes the objective of generative tasks, while it is\nincapable of satisfying all the requirements that a user might expect from a\ngenerative model. Reinforcement learning, serving as a competitive option to\ninject new training signals by creating new objectives that exploit novel\nsignals, has demonstrated its power and flexibility to incorporate human\ninductive bias from multiple angles, such as adversarial learning,\nhand-designed rules, and learned reward models, to build a performant model.\nThereby, reinforcement learning has become a trending research field and has\nstretched the limits of generative AI in both model design and application. It\nis reasonable to summarize recent advances with a\ncomprehensive review. Although surveys for individual application areas have appeared\nrecently, this survey aims to shed light on a high-level review that spans a\nrange of application areas. We provide a rigorous taxonomy in this area and\nprovide broad coverage of various models and applications. Notably, we also\nsurveyed the fast-developing large language model area. We conclude this survey\nby showing the potential directions that might tackle the limits of current\nmodels and expand the frontiers for generative AI.\n","authors":["Yuanjiang Cao","Lina Yao","Julian McAuley","Quan Z. Sheng"],"pdf_url":"https://arxiv.org/pdf/2308.14328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13268v2","updated":"2023-08-28T06:08:49Z","published":"2023-02-26T08:43:08Z","title":"Revolutionizing Genomics with Reinforcement Learning Techniques","summary":" In recent years, Reinforcement Learning (RL) has emerged as a powerful tool\nfor solving a wide range of problems, including decision-making and genomics.\nThe exponential growth of raw genomic data over the past two decades has\nexceeded the capacity of manual analysis, leading to a growing interest in\nautomatic data analysis and processing. RL algorithms are capable of learning\nfrom experience with minimal human supervision, making them well-suited for\ngenomic data analysis and interpretation. One of the key benefits of using RL\nis the reduced cost associated with collecting labeled training data, which is\nrequired for supervised learning. While there have been numerous studies\nexamining the applications of Machine Learning (ML) in genomics, this survey\nfocuses exclusively on the use of RL in various genomics research fields,\nincluding gene regulatory networks (GRNs), genome assembly, and sequence\nalignment. We present a comprehensive technical overview of existing studies on\nthe application of RL in genomics, highlighting the strengths and limitations\nof these approaches. We then discuss potential research directions that are\nworthy of future exploration, including the development of more sophisticated\nreward functions as RL heavily depends on the accuracy of the reward function,\nthe integration of RL with other machine learning techniques, and the\napplication of RL to new and emerging areas in genomics research. 
Finally, we\npresent our findings and conclude by summarizing the current state of the field\nand the future outlook for RL in genomics.\n","authors":["Mohsen Karami","Roohallah Alizadehsani"," Khadijeh"," Jahanian","Ahmadreza Argha","Iman Dehzangi","Hamid Alinejad-Rokny"],"pdf_url":"https://arxiv.org/pdf/2302.13268v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14322v1","updated":"2023-08-28T06:05:23Z","published":"2023-08-28T06:05:23Z","title":"Machine Unlearning Methodology base on Stochastic Teacher Network","summary":" The rise of the phenomenon of the \"right to be forgotten\" has prompted\nresearch on machine unlearning, which grants data owners the right to actively\nwithdraw data that has been used for model training, and requires the\nelimination of the contribution of that data to the model. A simple method to\nachieve this is to use the remaining data to retrain the model, but this is not\nacceptable for other data owners who continue to participate in training.\nExisting machine unlearning methods have been found to be ineffective in\nquickly removing knowledge from deep learning models. This paper proposes using\na stochastic network as a teacher to expedite the mitigation of the influence\ncaused by forgotten data on the model. We performed experiments on three\ndatasets, and the findings demonstrate that our approach can efficiently\nmitigate the influence of target data on the model within a single epoch. This\nallows for one-time erasure and reconstruction of the model, and the\nreconstruction model achieves the same performance as the retrained model.\n","authors":["Xulong Zhang","Jianzong Wang","Ning Cheng","Yifu Sun","Chuanyao Zhang","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14322v1.pdf","comment":"Accepted by 19th International Conference on Advanced Data Mining and\n Applications. (ADMA 2023)"},{"id":"http://arxiv.org/abs/2307.03380v3","updated":"2023-08-28T05:47:12Z","published":"2023-07-07T04:20:36Z","title":"On Formal Feature Attribution and Its Approximation","summary":" Recent years have witnessed the widespread use of artificial intelligence\n(AI) algorithms and machine learning (ML) models. Despite their tremendous\nsuccess, a number of vital problems like ML model brittleness, their fairness,\nand the lack of interpretability warrant the need for the active developments\nin explainable artificial intelligence (XAI) and formal ML model verification.\nThe two major lines of work in XAI include feature selection methods, e.g.\nAnchors, and feature attribution techniques, e.g. LIME and SHAP. Despite their\npromise, most of the existing feature selection and attribution approaches are\nsusceptible to a range of critical issues, including explanation unsoundness\nand out-of-distribution sampling. A recent formal approach to XAI (FXAI)\nalthough serving as an alternative to the above and free of these issues\nsuffers from a few other limitations. For instance and besides the scalability\nlimitation, the formal approach is unable to tackle the feature attribution\nproblem. Additionally, a formal explanation despite being formally sound is\ntypically quite large, which hampers its applicability in practical settings.\nMotivated by the above, this paper proposes a way to apply the apparatus of\nformal XAI to the case of feature attribution based on formal explanation\nenumeration. Formal feature attribution (FFA) is argued to be advantageous over\nthe existing methods, both formal and non-formal. 
Given the practical\ncomplexity of the problem, the paper then proposes an efficient technique for\napproximating exact FFA. Finally, it offers experimental evidence of the\neffectiveness of the proposed approximate FFA in comparison to the existing\nfeature attribution algorithms not only in terms of feature importance but\nalso in terms of their relative order.\n","authors":["Jinqiang Yu","Alexey Ignatiev","Peter J. Stuckey"],"pdf_url":"https://arxiv.org/pdf/2307.03380v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14308v1","updated":"2023-08-28T05:23:16Z","published":"2023-08-28T05:23:16Z","title":"Policy Diversity for Cooperative Agents","summary":" Standard cooperative multi-agent reinforcement learning (MARL) methods aim to\nfind the optimal team cooperative policy to complete a task. However, there may\nexist multiple different ways of cooperating, which are often valuable to\ndomain experts. Therefore, identifying a set of significantly different\npolicies can alleviate the task complexity for them. Unfortunately, there is a\ngeneral lack of effective policy diversity approaches specifically designed for\nthe multi-agent domain. In this work, we propose a method called\nMoment-Matching Policy Diversity to alleviate this problem. This method can\ngenerate different team policies to varying degrees by formalizing the\ndifference between team policies as the difference in actions of selected\nagents in different policies. Theoretically, we show that our method is a\nsimple way to implement a constrained optimization problem that regularizes the\ndifference between two trajectory distributions by using the maximum mean\ndiscrepancy. The effectiveness of our approach is demonstrated on a challenging\nteam-based shooter.\n","authors":["Mingxi Tan","Andong Tian","Ludovic Denoyer"],"pdf_url":"https://arxiv.org/pdf/2308.14308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.05490v7","updated":"2023-08-28T05:03:48Z","published":"2022-04-12T02:49:27Z","title":"Continuous-Time User Preference Modelling for Temporal Sets Prediction","summary":" Given a sequence of sets, where each set has a timestamp and contains an\narbitrary number of elements, temporal sets prediction aims to predict the\nelements in the subsequent set. Previous studies for temporal sets prediction\nmainly focus on the modelling of elements and implicitly represent each user's\npreference based on his/her interacted elements. However, user preferences are\noften continuously evolving and the evolutionary trend cannot be fully captured\nwith the indirect learning paradigm of user preferences. To this end, we\npropose a continuous-time user preference modelling framework for temporal sets\nprediction, which explicitly models the evolving preference of each user by\nmaintaining a memory bank to store the states of all the users and elements.\nSpecifically, we first construct a universal sequence by arranging all the\nuser-set interactions in a non-descending temporal order, and then\nchronologically learn from each user-set interaction. For each interaction, we\ncontinuously update the memories of the related user and elements based on\ntheir currently encoded messages and past memories. Moreover, we present a\npersonalized user behavior learning module to discover user-specific\ncharacteristics based on each user's historical sequence, which aggregates the\npreviously interacted elements from dual perspectives according to the user and\nelements. 
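The Moment-Matching Policy Diversity entry above regularizes the difference between two trajectory distributions with the maximum mean discrepancy. A generic unbiased RBF-kernel MMD estimator (not the authors' code) makes the quantity concrete:

import numpy as np

def rbf(x, y, sigma):
    d2 = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2.0 * sigma ** 2))

def mmd2_unbiased(x, y, sigma=1.0):
    # Unbiased estimate of squared MMD between samples x (m, d) and y (n, d).
    kxx = rbf(x, x, sigma)
    np.fill_diagonal(kxx, 0.0)
    kyy = rbf(y, y, sigma)
    np.fill_diagonal(kyy, 0.0)
    m, n = len(x), len(y)
    return (kxx.sum() / (m * (m - 1)) + kyy.sum() / (n * (n - 1))
            - 2.0 * rbf(x, y, sigma).mean())

rng = np.random.default_rng(0)
a = rng.normal(0.0, 1.0, size=(200, 2))             # e.g. action features of policy A
b = rng.normal(0.0, 1.0, size=(200, 2))             # same distribution as a
c = rng.normal(0.5, 1.0, size=(200, 2))             # shifted: a "different" policy
print(mmd2_unbiased(a, b), mmd2_unbiased(a, c))     # near zero vs. clearly positive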
Finally, we develop a set-batch algorithm to improve the model\nefficiency, which can create time-consistent batches in advance and achieve\n3.5x and 3.0x speedups in the training and evaluation process on average.\nExperiments on four real-world datasets demonstrate the superiority of our\napproach over state-of-the-arts under both transductive and inductive settings.\nThe good interpretability of our method is also shown.\n","authors":["Le Yu","Zihang Liu","Leilei Sun","Bowen Du","Chuanren Liu","Weifeng Lv"],"pdf_url":"https://arxiv.org/pdf/2204.05490v7.pdf","comment":"Accepted by the TKDE journal"},{"id":"http://arxiv.org/abs/2308.03312v3","updated":"2023-08-28T04:53:52Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.13799v3","updated":"2023-08-28T04:52:53Z","published":"2022-02-28T13:48:41Z","title":"One-shot Ultra-high-Resolution Generative Adversarial Network That\n Synthesizes 16K Images On A Single GPU","summary":" We propose a one-shot ultra-high-resolution generative adversarial network\n(OUR-GAN) framework that generates non-repetitive 16K (16,384 x 8,640) images\nfrom a single training image and is trainable on a single consumer GPU. OUR-GAN\ngenerates an initial image that is visually plausible and varied in shape at\nlow resolution, and then gradually increases the resolution by adding detail\nthrough super-resolution. Since OUR-GAN learns from a real\nultra-high-resolution (UHR) image, it can synthesize large shapes with fine\ndetails and long-range coherence, which is difficult to achieve with\nconventional generative models that rely on the patch distribution learned from\nrelatively small images. 
OUR-GAN can synthesize high-quality 16K images with\n12.5 GB of GPU memory and 4K images with only 4.29 GB as it synthesizes a UHR\nimage part by part through seamless subregion-wise super-resolution.\nAdditionally, OUR-GAN improves visual coherence while maintaining diversity by\napplying vertical positional convolution. In experiments on the ST4K and RAISE\ndatasets, OUR-GAN exhibited improved fidelity, visual coherency, and diversity\ncompared with the baseline one-shot synthesis models. To the best of our\nknowledge, OUR-GAN is the first one-shot image synthesizer that generates\nnon-repetitive UHR images on a single consumer GPU. The synthesized image\nsamples are presented at https://our-gan.github.io.\n","authors":["Junseok Oh","Donghwee Yoon","Injung Kim"],"pdf_url":"https://arxiv.org/pdf/2202.13799v3.pdf","comment":"36 pages, 26 figures"},{"id":"http://arxiv.org/abs/2303.12091v2","updated":"2023-08-28T04:50:57Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution. Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11760v2","updated":"2023-08-28T04:50:40Z","published":"2022-11-21T07:26:56Z","title":"A Low Latency Adaptive Coding Spiking Framework for Deep Reinforcement\n Learning","summary":" In recent years, spiking neural networks (SNNs) have been used in\nreinforcement learning (RL) due to their low power consumption and event-driven\nfeatures. However, spiking reinforcement learning (SRL), which suffers from\nfixed coding methods, still faces the problems of high latency and poor\nversatility. In this paper, we use learnable matrix multiplication to encode\nand decode spikes, improving the flexibility of the coders and thus reducing\nlatency. Meanwhile, we train the SNNs using the direct training method and use\ntwo different structures for online and offline RL algorithms, which gives our\nmodel a wider range of applications. 
Extensive experiments have revealed that\nour method achieves optimal performance with ultra-low latency (as low as 0.8%\nof other SRL methods) and excellent energy efficiency (up to 5X the DNNs) in\ndifferent algorithms and different environments.\n","authors":["Lang Qin","Rui Yan","Huajin Tang"],"pdf_url":"https://arxiv.org/pdf/2211.11760v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14853v2","updated":"2023-08-28T04:46:52Z","published":"2023-06-26T17:07:54Z","title":"Near-Optimal Nonconvex-Strongly-Convex Bilevel Optimization with Fully\n First-Order Oracles","summary":" Bilevel optimization has wide applications such as hyperparameter tuning,\nneural architecture search, and meta-learning. Designing efficient algorithms\nfor bilevel optimization is challenging because the lower-level problem defines\na feasibility set implicitly via another optimization problem. In this work, we\nconsider one tractable case when the lower-level problem is strongly convex.\nRecent works show that with a Hessian-vector product oracle, one can provably\nfind an $\\epsilon$-first-order stationary point within\n$\\tilde{\\mathcal{O}}(\\epsilon^{-2})$ oracle calls. However, Hessian-vector\nproduct may be inaccessible or expensive in practice. Kwon et al. (ICML 2023)\naddressed this issue by proposing a first-order method that can achieve the\nsame goal at a slower rate of $\\tilde{\\mathcal{O}}(\\epsilon^{-3})$. In this\nwork, we provide a tighter analysis demonstrating that this method can converge\nat the near-optimal $\\tilde {\\mathcal{O}}(\\epsilon^{-2})$ rate as second-order\nmethods. Our analysis further leads to simple first-order algorithms that\nachieve similar convergence rates for finding second-order stationary points\nand for distributed bilevel problems.\n","authors":["Lesi Chen","Yaohua Ma","Jingzhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.14853v2.pdf","comment":"slightly change the title"},{"id":"http://arxiv.org/abs/2308.14304v1","updated":"2023-08-28T04:37:38Z","published":"2023-08-28T04:37:38Z","title":"Solving Attention Kernel Regression Problem via Pre-conditioner","summary":" Large language models have shown impressive performance in many tasks. One of\nthe major features from the computation perspective is computing the attention\nmatrix. Previous works [Zandieh, Han, Daliri, and Karba 2023, Alman and Song\n2023] have formally studied the possibility and impossibility of approximating\nthe attention matrix. In this work, we define and study a new problem which is\ncalled the attention kernel regression problem. We show how to solve the\nattention kernel regression in the input sparsity time of the data matrix.\n","authors":["Zhao Song","Junze Yin","Lichen Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14295v1","updated":"2023-08-28T04:29:49Z","published":"2023-08-28T04:29:49Z","title":"Traffic Light Control with Reinforcement Learning","summary":" Traffic light control is important for reducing congestion in urban mobility\nsystems. This paper proposes a real-time traffic light control method using\ndeep Q learning. Our approach incorporates a reward function considering queue\nlengths, delays, travel time, and throughput. The model dynamically decides\nphase changes based on current traffic conditions. The training of the deep Q\nnetwork involves an offline stage from pre-generated data with fixed schedules\nand an online stage using real-time traffic data. 
A deep Q network structure\nwith a \"phase gate\" component is used to simplify the model's learning task\nunder different phases. A \"memory palace\" mechanism is used to address sample\nimbalance during the training process. We validate our approach using both\nsynthetic and real-world traffic flow data at a road intersection in Hangzhou,\nChina. Results demonstrate significant performance improvements of the proposed\nmethod in reducing vehicle waiting time (57.1% to 100%), queue lengths (40.9%\nto 100%), and total travel time (16.8% to 68.0%) compared to traditional fixed\nsignal plans.\n","authors":["Taoyu Pan"],"pdf_url":"https://arxiv.org/pdf/2308.14295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13298v2","updated":"2023-08-28T04:21:38Z","published":"2023-08-25T10:47:37Z","title":"Federated Linear Bandit Learning via Over-the-Air Computation","summary":" In this paper, we investigate federated contextual linear bandit learning\nwithin a wireless system that comprises a server and multiple devices. Each\ndevice interacts with the environment, selects an action based on the received\nreward, and sends model updates to the server. The primary objective is to\nminimize cumulative regret across all devices within a finite time horizon. To\nreduce the communication overhead, devices communicate with the server via\nover-the-air computation (AirComp) over noisy fading channels, where the\nchannel noise may distort the signals. In this context, we propose a customized\nfederated linear bandits scheme, where each device transmits an analog signal,\nand the server receives a superposition of these signals distorted by channel\nnoise. A rigorous mathematical analysis is conducted to determine the regret\nbound of the proposed scheme. Both theoretical analysis and numerical\nexperiments demonstrate the competitive performance of our proposed scheme in\nterms of regret bounds in various settings.\n","authors":["Jiali Wang","Yuning Jiang","Xin Liu","Ting Wang","Yuanming Shi"],"pdf_url":"https://arxiv.org/pdf/2308.13298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14272v1","updated":"2023-08-28T03:03:03Z","published":"2023-08-28T03:03:03Z","title":"Goodhart's Law Applies to NLP's Explanation Benchmarks","summary":" Despite the rising popularity of saliency-based explanations, the research\ncommunity remains at an impasse, facing doubts concerning their purpose,\nefficacy, and tendency to contradict each other. Seeking to unite the\ncommunity's efforts around common goals, several recent works have proposed\nevaluation metrics. In this paper, we critically examine two sets of metrics:\nthe ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics,\nfocusing our inquiry on natural language processing. First, we show that we can\ninflate a model's comprehensiveness and sufficiency scores dramatically without\naltering its predictions or explanations on in-distribution test inputs. Our\nstrategy exploits the tendency for extracted explanations and their complements\nto be \"out-of-support\" relative to each other and in-distribution inputs. Next,\nwe demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple\nmethod that encodes the label, even though EVAL-X is precisely motivated to\naddress such exploits. 
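For the traffic-signal entry above, a tabular Q-learning toy conveys the control loop that the paper's deep Q network learns. The two-direction queue dynamics, discharge rate, and state discretization below are invented for illustration, and the "phase gate" idea is approximated by simply including the current phase in the state.

import random
from collections import defaultdict

def env_step(phase, queues, action, rng):
    # Invented dynamics: two directions, the green one discharges 3 cars,
    # each direction receives 0-2 random arrivals per step.
    if action == 1:
        phase = 1 - phase                           # switch the green direction
    queues[phase] = max(0, queues[phase] - 3)
    for d in (0, 1):
        queues[d] += rng.randint(0, 2)
    return phase, -(queues[0] + queues[1])          # reward: negative total queue

rng = random.Random(0)
Q = defaultdict(float)
alpha, gamma, eps = 0.1, 0.9, 0.1
phase, queues, state = 0, [0, 0], (0, 0)
for _ in range(20000):
    action = rng.choice((0, 1)) if rng.random() < eps else \
        max((0, 1), key=lambda a: Q[(state, a)])
    phase, reward = env_step(phase, queues, action, rng)
    diff = queues[0] - queues[1]
    nxt = (phase, (diff > 0) - (diff < 0))          # sign-discretized imbalance
    Q[(state, action)] += alpha * (
        reward + gamma * max(Q[(nxt, 0)], Q[(nxt, 1)]) - Q[(state, action)])
    state = nxt
greedy = {s: max((0, 1), key=lambda a: Q[(s, a)]) for (s, _a) in list(Q)}
print(greedy)                                       # learned keep/switch per state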
Our results raise doubts about the ability of current\nmetrics to guide explainability research, underscoring the need for a broader\nreassessment of what precisely these metrics are intended to capture.\n","authors":["Jennifer Hsia","Danish Pruthi","Aarti Singh","Zachary C. Lipton"],"pdf_url":"https://arxiv.org/pdf/2308.14272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.01079v4","updated":"2023-08-28T02:58:42Z","published":"2022-01-04T10:49:30Z","title":"Incomplete Multi-View Weak-Label Learning","summary":" A variety of modern applications exhibit multi-view multi-label learning,\nwhere each sample has multi-view features, and multiple labels are correlated\nvia common views. Current methods usually fail to directly deal with the\nsetting where only a subset of features and labels are observed for each\nsample, and ignore the presence of noisy views and imbalanced labels in\nreal-world problems. In this paper, we propose a novel method to overcome the\nlimitations. It jointly embeds incomplete views and weak labels into a\nlow-dimensional subspace with adaptive weights, and facilitates the difference\nbetween embedding weight matrices via auto-weighted Hilbert-Schmidt\nIndependence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively\nlearns view-wise importance for embedding to detect noisy views, and mitigates\nthe label imbalance problem by focal loss. Experimental results on four\nreal-world multi-view multi-label datasets demonstrate the effectiveness of the\nproposed method.\n","authors":["Zhiwei Li","Zijian Yang","Lu Sun","Mineichi Kudo","Kego Kimura"],"pdf_url":"https://arxiv.org/pdf/2201.01079v4.pdf","comment":"6 pages, 2 figures, conference"},{"id":"http://arxiv.org/abs/2308.14267v1","updated":"2023-08-28T02:49:07Z","published":"2023-08-28T02:49:07Z","title":"Unleash Model Potential: Bootstrapped Meta Self-supervised Learning","summary":" The long-term goal of machine learning is to learn general visual\nrepresentations from a small amount of data without supervision, mimicking\nthree advantages of human cognition: i) no need for labels, ii) robustness to\ndata scarcity, and iii) learning from experience. Self-supervised learning and\nmeta-learning are two promising techniques to achieve this goal, but they both\nonly partially capture the advantages and fail to address all the problems.\nSelf-supervised learning struggles to overcome the drawbacks of data scarcity,\nwhile ignoring prior knowledge that can facilitate learning and generalization.\nMeta-learning relies on supervised information and suffers from a bottleneck of\ninsufficient learning. To address these issues, we propose a novel Bootstrapped\nMeta Self-Supervised Learning (BMSSL) framework that aims to simulate the human\nlearning process. We first analyze the close relationship between meta-learning\nand self-supervised learning. Based on this insight, we reconstruct tasks to\nleverage the strengths of both paradigms, achieving advantages i and ii.\nMoreover, we employ a bi-level optimization framework that alternates between\nsolving specific tasks with a learned ability (first level) and improving this\nability (second level), attaining advantage iii. To fully harness its power, we\nintroduce a bootstrapped target based on meta-gradient to make the model its\nown teacher. 
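The multi-view weak-label entry above penalizes redundancy between embedding weight matrices via the Hilbert-Schmidt Independence Criterion. A minimal (biased) empirical HSIC estimator with RBF kernels, generic rather than the paper's auto-weighted variant, looks like this:

import numpy as np

def rbf_gram(x, sigma=1.0):
    d2 = ((x[:, None, :] - x[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2.0 * sigma ** 2))

def hsic(x, y, sigma=1.0):
    # Biased empirical HSIC: trace(K H L H) / (n - 1)^2 with centering matrix H.
    n = len(x)
    h = np.eye(n) - np.ones((n, n)) / n
    k, l = rbf_gram(x, sigma), rbf_gram(y, sigma)
    return np.trace(k @ h @ l @ h) / (n - 1) ** 2

rng = np.random.default_rng(0)
a = rng.normal(size=(100, 2))
b = rng.normal(size=(100, 2))                       # independent of a
print(hsic(a, a), hsic(a, b))                       # dependent >> independent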
We validate the effectiveness of our approach with comprehensive\ntheoretical and empirical study.\n","authors":["Jingyao Wang","Zeen Song","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.14267v1.pdf","comment":"submitted to NIPS"},{"id":"http://arxiv.org/abs/2308.14258v1","updated":"2023-08-28T02:25:11Z","published":"2023-08-28T02:25:11Z","title":"Breaking Boundaries: Distributed Domain Decomposition with Scalable\n Physics-Informed Neural PDE Solvers","summary":" Mosaic Flow is a novel domain decomposition method designed to scale\nphysics-informed neural PDE solvers to large domains. Its unique approach\nleverages pre-trained networks on small domains to solve partial differential\nequations on large domains purely through inference, resulting in high\nreusability. This paper presents an end-to-end parallelization of Mosaic Flow,\ncombining data parallel training and domain parallelism for inference on\nlarge-scale problems. By optimizing the network architecture and data parallel\ntraining, we significantly reduce the training time for learning the Laplacian\noperator to minutes on 32 GPUs. Moreover, our distributed domain decomposition\nalgorithm enables scalable inferences for solving the Laplace equation on\ndomains 4096 times larger than the training domain, demonstrating strong\nscaling while maintaining accuracy on 32 GPUs. The reusability of Mosaic Flow,\ncombined with the improved performance achieved through the distributed-memory\nalgorithms, makes it a promising tool for modeling complex physical phenomena\nand accelerating scientific discovery.\n","authors":["Arthur Feeney","Zitong Li","Ramin Bostanabad","Aparna Chandramowlishwaran"],"pdf_url":"https://arxiv.org/pdf/2308.14258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14253v1","updated":"2023-08-28T02:10:38Z","published":"2023-08-28T02:10:38Z","title":"The Promise and Peril of Artificial Intelligence -- Violet Teaming\n Offers a Balanced Path Forward","summary":" Artificial intelligence (AI) promises immense benefits across sectors, yet\nalso poses risks from dual-use potentials, biases, and unintended behaviors.\nThis paper reviews emerging issues with opaque and uncontrollable AI systems\nand proposes an integrative framework called violet teaming to develop reliable\nand responsible AI. Violet teaming combines adversarial vulnerability probing\n(red teaming) with solutions for safety and security (blue teaming) while\nprioritizing ethics and social benefit. It emerged from AI safety research to\nmanage risks proactively by design. The paper traces the evolution of red,\nblue, and purple teaming toward violet teaming, and then discusses applying\nviolet techniques to address biosecurity risks of AI in biotechnology.\nAdditional sections review key perspectives across law, ethics, cybersecurity,\nmacrostrategy, and industry best practices essential for operationalizing\nresponsible AI through holistic technical and social considerations. Violet\nteaming provides both philosophy and method for steering AI trajectories toward\nsocietal good. With conscience and wisdom, the extraordinary capabilities of AI\ncan enrich humanity. But without adequate precaution, the risks could prove\ncatastrophic. Violet teaming aims to empower moral technology for the common\nwelfare.\n","authors":["Alexander J. Titus","Adam H. 
Russell"],"pdf_url":"https://arxiv.org/pdf/2308.14253v1.pdf","comment":"14 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.14250v1","updated":"2023-08-28T01:57:38Z","published":"2023-08-28T01:57:38Z","title":"Rule-Based Error Detection and Correction to Operationalize Movement\n Trajectory Classification","summary":" Classification of movement trajectories has many applications in\ntransportation. Supervised neural models represent the current\nstate-of-the-art. Recent security applications require this task to be rapidly\nemployed in environments that may differ from the data used to train such\nmodels for which there is little training data. We provide a neuro-symbolic\nrule-based framework to conduct error correction and detection of these models\nto support eventual deployment in security applications. We provide a suite of\nexperiments on several recent and state-of-the-art models and show an accuracy\nimprovement of 1.7% over the SOTA model in the case where all classes are\npresent in training and when 40% of classes are omitted from training, we\nobtain a 5.2% improvement (zero-shot) and 23.9% (few-shot) improvement over the\nSOTA model without resorting to retraining of the base model.\n","authors":["Bowen Xi","Kevin Scaria","Paulo Shakarian"],"pdf_url":"https://arxiv.org/pdf/2308.14250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11339v2","updated":"2023-08-28T01:50:00Z","published":"2023-08-22T10:36:56Z","title":"ProAgent: Building Proactive Cooperative AI with Large Language Models","summary":" Building AIs with adaptive behaviors in human-AI cooperation stands as a\npivotal focus in AGI research. Current methods for developing cooperative\nagents predominantly rely on learning-based methods, where policy\ngeneralization heavily hinges on past interactions with specific teammates.\nThese approaches constrain the agent's capacity to recalibrate its strategy\nwhen confronted with novel teammates. We propose \\textbf{ProAgent}, a novel\nframework that harnesses large language models (LLMs) to fashion a\n\\textit{pro}active \\textit{agent} empowered with the ability to anticipate\nteammates' forthcoming decisions and formulate enhanced plans for itself.\nProAgent excels at cooperative reasoning with the capacity to dynamically adapt\nits behavior to enhance collaborative efforts with teammates. Moreover, the\nProAgent framework exhibits a high degree of modularity and interpretability,\nfacilitating seamless integration to address a wide array of coordination\nscenarios. Experimental evaluations conducted within the framework of\n\\textit{Overcook-AI} unveil the remarkable performance superiority of ProAgent,\noutperforming five methods based on self-play and population-based training in\ncooperation with AI agents. Further, when cooperating with human proxy models,\nits performance exhibits an average improvement exceeding 10\\% compared to the\ncurrent state-of-the-art, COLE. The advancement was consistently observed\nacross diverse scenarios involving interactions with both AI agents of varying\ncharacteristics and human counterparts. These findings inspire future research\nfor human-robot collaborations. 
For a hands-on demonstration, please visit\n\\url{https://pku-proagent.github.io}.\n","authors":["Ceyao Zhang","Kaijie Yang","Siyi Hu","Zihao Wang","Guanghe Li","Yihang Sun","Cheng Zhang","Zhaowei Zhang","Anji Liu","Song-Chun Zhu","Xiaojun Chang","Junge Zhang","Feng Yin","Yitao Liang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11339v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04263v2","updated":"2023-08-28T01:39:00Z","published":"2023-08-08T13:59:56Z","title":"BarlowRL: Barlow Twins for Data-Efficient Reinforcement Learning","summary":" This paper introduces BarlowRL, a data-efficient reinforcement learning agent\nthat combines the Barlow Twins self-supervised learning framework with DER\n(Data-Efficient Rainbow) algorithm. BarlowRL outperforms both DER and its\ncontrastive counterpart CURL on the Atari 100k benchmark. BarlowRL avoids\ndimensional collapse by enforcing information spread to the whole space. This\nhelps RL algorithms to utilize uniformly spread state representation that\neventually results in a remarkable performance. The integration of Barlow Twins\nwith DER enhances data efficiency and achieves superior performance in the RL\ntasks. BarlowRL demonstrates the potential of incorporating self-supervised\nlearning techniques to improve RL algorithms.\n","authors":["Omer Veysel Cagatan","Baris Akgun"],"pdf_url":"https://arxiv.org/pdf/2308.04263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14245v1","updated":"2023-08-28T01:21:22Z","published":"2023-08-28T01:21:22Z","title":"A Comparison of Personalized and Generalized Approaches to Emotion\n Recognition Using Consumer Wearable Devices: Machine Learning Study","summary":" Background: Studies have shown the potential adverse health effects, ranging\nfrom headaches to cardiovascular disease, associated with long-term negative\nemotions and chronic stress. Since many indicators of stress are imperceptible\nto observers, the early detection and intervention of stress remains a pressing\nmedical need. Physiological signals offer a non-invasive method of monitoring\nemotions and are easily collected by smartwatches. Existing research primarily\nfocuses on developing generalized machine learning-based models for emotion\nclassification. Objective: We aim to study the differences between personalized\nand generalized machine learning models for three-class emotion classification\n(neutral, stress, and amusement) using wearable biosignal data. Methods: We\ndeveloped a convolutional encoder for the three-class emotion classification\nproblem using data from WESAD, a multimodal dataset with physiological signals\nfor 15 subjects. We compared the results between a subject-exclusive\ngeneralized, subject-inclusive generalized, and personalized model. Results:\nFor the three-class classification problem, our personalized model achieved an\naverage accuracy of 95.06% and F1-score of 91.71, our subject-inclusive\ngeneralized model achieved an average accuracy of 66.95% and F1-score of 42.50,\nand our subject-exclusive generalized model achieved an average accuracy of\n67.65% and F1-score of 43.05. Conclusions: Our results emphasize the need for\nincreased research in personalized emotion recognition models given that they\noutperform generalized models in certain contexts. 
We also demonstrate that\npersonalized machine learning models for emotion classification are viable and\ncan achieve high performance.\n","authors":["Joe Li","Peter Washington"],"pdf_url":"https://arxiv.org/pdf/2308.14245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17262v2","updated":"2023-08-28T01:17:32Z","published":"2022-10-31T12:36:37Z","title":"QNet: A Quantum-native Sequence Encoder Architecture","summary":" This work proposes QNet, a novel sequence encoder model that performs\ninference entirely on a quantum computer using a minimum number of qubits. Let $n$\nand $d$ represent the length of the sequence and the embedding size,\nrespectively. The dot-product attention mechanism requires a time complexity of\n$O(n^2 \cdot d)$, while QNet has merely $O(n+d)$ quantum circuit depth. In\naddition, we introduce ResQNet, a quantum-classical hybrid model composed of\nseveral QNet blocks linked by residual connections, as an isomorph of the Transformer\nencoder. We evaluated our work on various natural language processing tasks,\nincluding text classification, rating score prediction, and named entity\nrecognition. Our models exhibit compelling performance over classical\nstate-of-the-art models with a thousand times fewer parameters. In summary,\nthis work investigates the advantage of machine learning on near-term quantum\ncomputers in sequential data by experimenting with natural language processing\ntasks.\n","authors":["Wei Day","Hao-Sheng Chen","Min-Te Sun"],"pdf_url":"https://arxiv.org/pdf/2210.17262v2.pdf","comment":"QCE23: 2023 IEEE International Conference on Quantum Computing &\n Engineering"},{"id":"http://arxiv.org/abs/2308.13111v2","updated":"2023-08-28T00:38:43Z","published":"2023-08-24T23:06:21Z","title":"Bayesian low-rank adaptation for large language models","summary":" Parameter-efficient fine-tuning (PEFT) has emerged as a new paradigm for\ncost-efficient fine-tuning of large language models (LLMs), with low-rank\nadaptation (LoRA) being a widely adopted choice. However, fine-tuned LLMs often\nbecome overconfident, especially when fine-tuned on small datasets. Bayesian\nmethods, with their inherent ability to estimate uncertainty, serve as potent\ntools to mitigate overconfidence and enhance calibration. In this work, we\nintroduce Laplace-LoRA, a straightforward yet effective Bayesian method, which\napplies the Laplace approximation to the LoRA parameters and considerably\nboosts the calibration of fine-tuned LLMs.\n","authors":["Adam X. Yang","Maxime Robeyns","Xi Wang","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2308.13111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09658v2","updated":"2023-08-28T00:36:11Z","published":"2023-03-16T21:31:55Z","title":"Energy Management of Multi-mode Plug-in Hybrid Electric Vehicle using\n Multi-agent Deep Reinforcement Learning","summary":" The recently emerging multi-mode plug-in hybrid electric vehicle (PHEV)\ntechnology is one of the pathways making contributions to decarbonization, and\nits energy management requires multiple-input and multiple-output (MIMO)\ncontrol. At present, existing methods usually decouple the MIMO control\ninto multiple-input single-output (MISO) control and can only achieve locally optimal\nperformance. To optimize the multi-mode vehicle globally, this paper studies a\nMIMO control method for energy management of the multi-mode PHEV based on\nmulti-agent deep reinforcement learning (MADRL). 
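Laplace-LoRA above applies a Laplace approximation to the LoRA parameters. The mechanics are easiest to see on a toy model: fit MAP weights of a logistic regression, take the Hessian at the MAP as a Gaussian posterior precision, and sample weights from the resulting Gaussian. Everything below is a small numpy illustration of that recipe, not the paper's LLM-scale implementation.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))                       # toy features
true_w = np.array([1.5, -2.0, 0.5])
y = (X @ true_w + 0.3 * rng.normal(size=200) > 0).astype(float)

w, lam = np.zeros(3), 1.0                           # weights and prior precision
for _ in range(500):                                # MAP fit by gradient descent
    p = 1.0 / (1.0 + np.exp(-X @ w))
    w -= 0.1 * (X.T @ (p - y) + lam * w) / len(y)

p = 1.0 / (1.0 + np.exp(-X @ w))
H = X.T @ (X * (p * (1.0 - p))[:, None]) + lam * np.eye(3)   # Hessian at the MAP
cov = np.linalg.inv(H)                              # Gaussian posterior covariance
samples = rng.multivariate_normal(w, cov, size=10)  # posterior weight samples
print("MAP weights:", w)
print("posterior std:", np.sqrt(np.diag(cov)))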
By introducing a relevance\nratio, a hand-shaking strategy is proposed to enable two learning agents to\nwork collaboratively under the MADRL framework using the deep deterministic\npolicy gradient (DDPG) algorithm. Unified settings for the DDPG agents are\nobtained through a sensitivity analysis of the factors influencing the\nlearning performance. The optimal working mode for the hand-shaking strategy is\nattained through a parametric study on the relevance ratio. The advantage of\nthe proposed energy management method is demonstrated on a software-in-the-loop\ntesting platform. The result of the study indicates that the learning rate of\nthe DDPG agents is the most influential factor for learning performance.\nUsing the unified DDPG settings and a relevance ratio of 0.2, the proposed\nMADRL system can save up to 4% energy compared to the single-agent learning\nsystem and up to 23.54% energy compared to the conventional rule-based system.\n","authors":["Min Hua","Cetengfei Zhang","Fanggang Zhang","Zhi Li","Xiaoli Yu","Hongming Xu","Quan Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.09658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14239v1","updated":"2023-08-28T00:34:40Z","published":"2023-08-28T00:34:40Z","title":"Quantum Next Generation Reservoir Computing: An Efficient Quantum\n Algorithm for Forecasting Quantum Dynamics","summary":" Next Generation Reservoir Computing (NG-RC) is a modern class of model-free\nmachine learning that enables accurate forecasting of time series data\ngenerated by dynamical systems. We demonstrate that NG-RC can accurately\npredict full many-body quantum dynamics, instead of merely concentrating on the\ndynamics of observables, which is the conventional application of reservoir\ncomputing. In addition, we apply a technique which we refer to as skipping\nahead to predict far future states accurately without the need to extract\ninformation about the intermediate states. However, adopting a classical NG-RC\nfor many-body quantum dynamics prediction is computationally prohibitive due to\nthe large Hilbert space of sample input data. In this work, we propose an\nend-to-end quantum algorithm for many-body quantum dynamics forecasting with a\nquantum computational speedup via the block-encoding technique. This proposal\npresents an efficient model-free quantum scheme to forecast quantum dynamics\ncoherently, bypassing inductive biases incurred in a model-based approach.\n","authors":["Apimuk Sornsaeng","Ninnat Dangniam","Thiparat Chotibut"],"pdf_url":"https://arxiv.org/pdf/2308.14239v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14946v1","updated":"2023-08-28T23:55:23Z","published":"2023-08-28T23:55:23Z","title":"Reinforcement Learning for Sampling on Temporal Medical Imaging\n Sequences","summary":" Accelerated magnetic resonance imaging resorts to either Fourier-domain\nsubsampling or better reconstruction algorithms to deal with fewer measurements\nwhile still generating medical images of high quality. Determining the optimal\nsampling strategy given a fixed reconstruction protocol often has combinatorial\ncomplexity. In this work, we apply double deep Q-learning and REINFORCE\nalgorithms to learn the sampling strategy for dynamic image reconstruction. We\nconsider the data in the format of time series, and the reconstruction method\nis a pre-trained autoencoder-type neural network.
We present a proof of\nconcept that reinforcement learning algorithms are effective at discovering the\noptimal sampling pattern which underlies the pre-trained reconstructor network\n(i.e., the dynamics in the environment). The code for replicating experiments\ncan be found at https://github.com/zhishenhuang/RLsamp.\n","authors":["Zhishen Huang"],"pdf_url":"https://arxiv.org/pdf/2308.14946v1.pdf","comment":"ICML 2023 Workshop SODS"},{"id":"http://arxiv.org/abs/2308.14945v1","updated":"2023-08-28T23:51:33Z","published":"2023-08-28T23:51:33Z","title":"Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals","summary":" We consider the problem of sampling from a distribution governed by a\npotential function. This work proposes an explicit score-based MCMC method that\nis deterministic, resulting in a deterministic evolution for particles rather\nthan a stochastic differential equation evolution. The score term is given in\nclosed form by a regularized Wasserstein proximal, using a kernel convolution\nthat is approximated by sampling. We demonstrate fast convergence on various\nproblems and show improved dimensional dependence of mixing time bounds for the\ncase of Gaussian distributions compared to the unadjusted Langevin algorithm\n(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We additionally\nderive closed form expressions for the distributions at each iterate for\nquadratic potential functions, characterizing the variance reduction. Empirical\nresults demonstrate that the particles behave in an organized manner, lying on\nlevel set contours of the potential. Moreover, the posterior mean estimator of\nthe proposed method is shown to be closer to the maximum a-posteriori estimator\ncompared to ULA and MALA, in the context of Bayesian logistic regression.\n","authors":["Hong Ye Tan","Stanley Osher","Wuchen Li"],"pdf_url":"https://arxiv.org/pdf/2308.14945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14938v1","updated":"2023-08-28T23:33:07Z","published":"2023-08-28T23:33:07Z","title":"Entropy-based Guidance of Deep Neural Networks for Accelerated\n Convergence and Improved Performance","summary":" Neural networks have dramatically increased our capacity to learn from large,\nhigh-dimensional datasets across innumerable disciplines. However, their\ndecisions are not easily interpretable, their computational costs are high, and\nbuilding and training them are uncertain processes. To add structure to these\nefforts, we derive new mathematical results to efficiently measure the changes\nin entropy as fully-connected and convolutional neural networks process data,\nand introduce entropy-based loss terms. Experiments in image compression and\nimage classification on benchmark datasets demonstrate these losses guide\nneural networks to learn rich latent data representations in fewer dimensions,\nconverge in fewer training epochs, and achieve better test metrics.\n","authors":["Mackenzie J. Meni","Ryan T. 
White","Michael Mayo","Kevin Pilkiewicz"],"pdf_url":"https://arxiv.org/pdf/2308.14938v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2204.01585v4","updated":"2023-08-28T23:24:59Z","published":"2022-04-04T15:33:21Z","title":"Differentially Private Sampling from Rashomon Sets, and the Universality\n of Langevin Diffusion for Convex Optimization","summary":" In this paper we provide an algorithmic framework based on Langevin diffusion\n(LD) and its corresponding discretizations that allow us to simultaneously\nobtain: i) An algorithm for sampling from the exponential mechanism, whose\nprivacy analysis does not depend on convexity and which can be stopped at\nanytime without compromising privacy, and ii) tight uniform stability\nguarantees for the exponential mechanism. As a direct consequence, we obtain\noptimal excess empirical and population risk guarantees for (strongly) convex\nlosses under both pure and approximate differential privacy (DP). The framework\nallows us to design a DP uniform sampler from the Rashomon set. Rashomon sets\nare widely used in interpretable and robust machine learning, understanding\nvariable importance, and characterizing fairness.\n","authors":["Arun Ganesh","Abhradeep Thakurta","Jalaj Upadhyay"],"pdf_url":"https://arxiv.org/pdf/2204.01585v4.pdf","comment":"Appeared in COLT 2023. For ease of presentation, some results appear\n in the previous version of this paper on arXiv (v3) that do not appear in\n this version, nor are subsumed by results in this version. Please see Section\n 1.4 for more details"},{"id":"http://arxiv.org/abs/2306.16740v3","updated":"2023-08-28T23:19:14Z","published":"2023-06-29T07:31:43Z","title":"Principles and Guidelines for Evaluating Social Robot Navigation\n Algorithms","summary":" A major challenge to deploying robots widely is navigation in human-populated\nenvironments, commonly referred to as social robot navigation. While the field\nof social navigation has advanced tremendously in recent years, the fair\nevaluation of algorithms that tackle social navigation remains hard because it\ninvolves not just robotic agents moving in static environments but also dynamic\nhuman agents and their perceptions of the appropriateness of robot behavior. In\ncontrast, clear, repeatable, and accessible benchmarks have accelerated\nprogress in fields like computer vision, natural language processing and\ntraditional robot navigation by enabling researchers to fairly compare\nalgorithms, revealing limitations of existing solutions and illuminating\npromising new directions. We believe the same approach can benefit social\nnavigation. In this paper, we pave the road towards common, widely accessible,\nand repeatable benchmarking criteria to evaluate social robot navigation. 
Our\ncontributions include (a) a definition of a socially navigating robot as one\nthat respects the principles of safety, comfort, legibility, politeness, social\ncompetency, agent understanding, proactivity, and responsiveness to context,\n(b) guidelines for the use of metrics, development of scenarios, benchmarks,\ndatasets, and simulators to evaluate social navigation, and (c) a design of a\nsocial navigation metrics framework to make it easier to compare results from\ndifferent simulators, robots and datasets.\n","authors":["Anthony Francis","Claudia Pérez-D'Arpino","Chengshu Li","Fei Xia","Alexandre Alahi","Rachid Alami","Aniket Bera","Abhijat Biswas","Joydeep Biswas","Rohan Chandra","Hao-Tien Lewis Chiang","Michael Everett","Sehoon Ha","Justin Hart","Jonathan P. How","Haresh Karnan","Tsang-Wei Edward Lee","Luis J. Manso","Reuth Mirksy","Sören Pirk","Phani Teja Singamaneni","Peter Stone","Ada V. Taylor","Peter Trautman","Nathan Tsoi","Marynel Vázquez","Xuesu Xiao","Peng Xu","Naoki Yokoyama","Alexander Toshev","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2306.16740v3.pdf","comment":"42 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.14930v1","updated":"2023-08-28T23:08:32Z","published":"2023-08-28T23:08:32Z","title":"Application of Quantum Pre-Processing Filter for Binary Image\n Classification with Small Samples","summary":" Over the past few years, there has been significant interest in Quantum\nMachine Learning (QML) among researchers, as it has the potential to transform\nthe field of machine learning. Several models that exploit the properties of\nquantum mechanics have been developed for practical applications. In this\nstudy, we investigated the application of our previously proposed quantum\npre-processing filter (QPF) to binary image classification. We evaluated the\nQPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits\nand alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic\nsign images). Similar to our previous multi-class classification results, the\napplication of QPF improved the binary image classification accuracy using a\nneural network on MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8%\nto 98.3%, and 71.2% to 76.1%, respectively, but degraded it on GTSRB from\n93.5% to 92.0%. We then applied QPF in cases using a smaller number of training\nand testing samples, i.e., 80 and 20 samples per class, respectively. In order\nto derive statistically stable results, we conducted the experiment with 100\ntrials, randomly choosing different training and testing samples each time and\naveraging the results. The result showed that the application of QPF did not\nimprove the image classification accuracy on MNIST and EMNIST but improved it\non CIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively.\nFurther research will be conducted to investigate the potential of QPF and to\nassess the scalability of the proposed approach to larger and more complex\ndatasets.\n","authors":["Farina Riaz","Shahab Abdulla","Hajime Suzuki","Srinjoy Ganguly","Ravinesh C. Deo","Susan Hopkins"],"pdf_url":"https://arxiv.org/pdf/2308.14930v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14929v1","updated":"2023-08-28T23:08:15Z","published":"2023-08-28T23:08:15Z","title":"Maestro: Uncovering Low-Rank Structures via Trainable Decomposition","summary":" Deep Neural Networks (DNNs) have been a large driver and enabler for AI\nbreakthroughs in recent years.
These models have been getting larger in their\nattempt to become more accurate and tackle new upcoming use-cases, including\nAR/VR and intelligent assistants. However, the training process of such large\nmodels is a costly and time-consuming process, which typically yields a single\nmodel to fit all targets. To mitigate this, various techniques have been\nproposed in the literature, including pruning, sparsification or quantization\nof the model weights and updates. While able to achieve high compression rates,\nthey often incur computational overheads or accuracy penalties. Alternatively,\nfactorization methods have been leveraged to incorporate low-rank compression\nin the training process. However, such techniques (e.g., SVD) frequently rely\non the computationally expensive decomposition of layers and are potentially\nsub-optimal for non-linear models, such as DNNs. In this work, we take a\nfurther step in designing efficient low-rank models and propose Maestro, a\nframework for trainable low-rank layers. Instead of regularly applying a priori\ndecompositions such as SVD, the low-rank structure is built into the training\nprocess through a generalized variant of Ordered Dropout. This method imposes\nan importance ordering via sampling on the decomposed DNN structure. Our\ntheoretical analysis demonstrates that our method recovers the SVD\ndecomposition of a linear mapping on uniformly distributed data and PCA for\nlinear autoencoders. We further apply our technique on DNNs and empirically\nillustrate that Maestro enables the extraction of lower footprint models that\npreserve model performance while allowing for a graceful accuracy-latency\ntradeoff for the deployment to devices of different capabilities.\n","authors":["Samuel Horvath","Stefanos Laskaridis","Shashank Rajput","Hongyi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14929v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2305.17118v2","updated":"2023-08-28T22:48:46Z","published":"2023-05-26T17:39:58Z","title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for\n LLM KV Cache Compression at Test Time","summary":" Large language models (LLMs) have sparked a new wave of exciting AI\napplications. Hosting these models at scale requires significant memory\nresources. One crucial memory bottleneck for the deployment stems from the\ncontext window. It is commonly recognized that model weights are memory hungry;\nhowever, the size of the key-value embeddings stored during the generation\nprocess (the KV cache) can easily surpass the model size. The enormous size of\nthe KV cache puts constraints on the inference batch size, which is crucial for\nhigh-throughput inference workloads. Inspired by an interesting observation of\nthe attention scores, we hypothesize the persistence of importance: only\npivotal tokens, which had a substantial influence at one step, will\nsignificantly influence future generations. Based on our empirical verification\nand theoretical analysis around this hypothesis, we propose Scissorhands, a\nsystem that maintains the memory usage of the KV cache at a fixed budget\nwithout finetuning the model. In essence, Scissorhands manages the KV cache by\nstoring the pivotal tokens with a higher probability. We validate that\nScissorhands reduces the inference memory usage of the KV cache by up to 5X\nwithout compromising model quality.
We further demonstrate that Scissorhands can be\ncombined with 4-bit quantization, traditionally used to compress model weights,\nto achieve up to 20X compression.\n","authors":["Zichang Liu","Aditya Desai","Fangshuo Liao","Weitao Wang","Victor Xie","Zhaozhuo Xu","Anastasios Kyrillidis","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2305.17118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14924v1","updated":"2023-08-28T22:42:51Z","published":"2023-08-28T22:42:51Z","title":"Optimal Economic Gas Turbine Dispatch with Deep Reinforcement Learning","summary":" Dispatching strategies for gas turbines (GTs) are changing in modern\nelectricity grids. A growing incorporation of intermittent renewable energy\nrequires GTs to operate more but shorter cycles and more frequently on partial\nloads. Deep reinforcement learning (DRL) has recently emerged as a tool that\ncan cope with this development and dispatch GTs economically. The key\nadvantages of DRL are a model-free optimization and the ability to handle\nuncertainties, such as those introduced by varying loads or renewable energy\nproduction. In this study, three popular DRL algorithms are implemented for an\neconomic GT dispatch problem on a case study in Alberta, Canada. We highlight\nthe benefits of DRL by incorporating an existing thermodynamic software\nprovided by Siemens Energy into the environment model and by simulating\nuncertainty via varying electricity prices, loads, and ambient conditions.\nAmong the tested algorithms and baseline methods, Deep Q-Networks (DQN)\nobtained the highest rewards while Proximal Policy Optimization (PPO) was the\nmost sample efficient. We further propose and implement a method to assign GT\noperation and maintenance cost dynamically based on operating hours and cycles.\nCompared to existing methods, our approach better approximates the true cost of\nmodern GT dispatch and hence leads to more realistic policies.\n","authors":["Manuel Sage","Martin Staniszewski","Yaoyao Fiona Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14924v1.pdf","comment":"This work has been accepted to IFAC for publication under a Creative\n Commons Licence CC-BY-NC-ND"},{"id":"http://arxiv.org/abs/2308.14921v1","updated":"2023-08-28T22:32:05Z","published":"2023-08-28T22:32:05Z","title":"Gender bias and stereotypes in Large Language Models","summary":" Large Language Models (LLMs) have made substantial progress in the past\nseveral months, shattering state-of-the-art benchmarks in many domains. This\npaper investigates LLMs' behavior with respect to gender stereotypes, a known\nissue for prior models. We use a simple paradigm to test the presence of gender\nbias, building on but differing from WinoBias, a commonly used gender bias\ndataset, which is likely to be included in the training data of current LLMs.\nWe test four recently published LLMs and demonstrate that they express biased\nassumptions about men and women's occupations. 
Our contributions in this paper\nare as follows: (a) LLMs are 3-6 times more likely to choose an occupation that\nstereotypically aligns with a person's gender; (b) these choices align with\npeople's perceptions better than with the ground truth as reflected in official\njob statistics; (c) LLMs in fact amplify the bias beyond what is reflected in\nperceptions or the ground truth; (d) LLMs ignore crucial ambiguities in\nsentence structure 95% of the time in our study items, but when explicitly\nprompted, they recognize the ambiguity; (e) LLMs provide explanations for their\nchoices that are factually inaccurate and likely obscure the true reason behind\ntheir predictions. That is, they provide rationalizations of their biased\nbehavior. This highlights a key property of these models: LLMs are trained on\nimbalanced datasets; as such, even with the recent successes of reinforcement\nlearning with human feedback, they tend to reflect those imbalances back at us.\nAs with other types of societal biases, we suggest that LLMs must be carefully\ntested to ensure that they treat minoritized individuals and communities\nequitably.\n","authors":["Hadas Kotek","Rikker Dockum","David Q. Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14921v1.pdf","comment":"ACM Collective Intelligence"},{"id":"http://arxiv.org/abs/2308.14920v1","updated":"2023-08-28T22:29:57Z","published":"2023-08-28T22:29:57Z","title":"Matbench Discovery -- An evaluation framework for machine learning\n crystal stability prediction","summary":" Matbench Discovery simulates the deployment of machine learning (ML) energy\nmodels in a high-throughput search for stable inorganic crystals. We address\nthe disconnect between (i) thermodynamic stability and formation energy and\n(ii) in-domain vs out-of-distribution performance. Alongside this paper, we\npublish a Python package to aid with future model submissions and a growing\nonline leaderboard with further insights into trade-offs between various\nperformance metrics. To answer the question of which ML methodology performs\nbest at materials discovery, our initial release explores a variety of models\nincluding random forests, graph neural networks (GNN), one-shot predictors,\niterative Bayesian optimizers and universal interatomic potentials (UIP).\nRanked best-to-worst by their test set F1 score on thermodynamic stability\nprediction, we find CHGNet > M3GNet > MACE > ALIGNN > MEGNet > CGCNN > CGCNN+P\n> Wrenformer > BOWSR > Voronoi tessellation fingerprints with random forest.\nThe top 3 models are UIPs, the winning methodology for ML-guided materials\ndiscovery, achieving F1 scores of ~0.6 for crystal stability classification and\ndiscovery acceleration factors (DAF) of up to 5x on the first 10k most stable\npredictions compared to dummy selection from our test set. We also highlight a\nsharp disconnect between commonly used global regression metrics and more\ntask-relevant classification metrics. Accurate regressors are susceptible to\nunexpectedly high false-positive rates if those accurate predictions lie close\nto the decision boundary at 0 eV/atom above the convex hull, where most\nmaterials are. Our results highlight the need to focus on classification\nmetrics that actually correlate with improved stability hit rate.\n","authors":["Janosh Riebesell","Rhys E. A. Goodall","Anubhav Jain","Philipp Benner","Kristin A. Persson","Alpha A. 
Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14920v1.pdf","comment":"18 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.14919v1","updated":"2023-08-28T22:29:16Z","published":"2023-08-28T22:29:16Z","title":"On Reward Structures of Markov Decision Processes","summary":" A Markov decision process can be parameterized by a transition kernel and a\nreward function. Both play essential roles in the study of reinforcement\nlearning as evidenced by their presence in the Bellman equations. In our\ninquiry of various kinds of ``costs'' associated with reinforcement learning\ninspired by the demands in robotic applications, rewards are central to\nunderstanding the structure of a Markov decision process and reward-centric\nnotions can elucidate important concepts in reinforcement learning.\nSpecifically, we studied the sample complexity of policy evaluation and\ndeveloped a novel estimator with an instance-specific error bound of\n$\\tilde{O}(\\sqrt{\\frac{\\tau_s}{n}})$ for estimating a single state value. Under\nthe online regret minimization setting, we refined the transition-based MDP\nconstant, diameter, into a reward-based constant, maximum expected hitting\ncost, and with it, provided a theoretical explanation for how a well-known\ntechnique, potential-based reward shaping, could accelerate learning with\nexpert knowledge. In an attempt to study safe reinforcement learning, we\nmodeled hazardous environments with irrecoverability and proposed a\nquantitative notion of safe learning via reset efficiency. In this setting, we\nmodified a classic algorithm to account for resets achieving promising\npreliminary numerical results. Lastly, for MDPs with multiple reward functions,\nwe developed a planning algorithm that computationally efficiently finds Pareto\noptimal stochastic policies.\n","authors":["Falcon Z. Dai"],"pdf_url":"https://arxiv.org/pdf/2308.14919v1.pdf","comment":"This PhD thesis draws heavily from arXiv:1907.02114 and\n arXiv:2002.06299"},{"id":"http://arxiv.org/abs/2308.14916v1","updated":"2023-08-28T22:26:50Z","published":"2023-08-28T22:26:50Z","title":"RecRec: Algorithmic Recourse for Recommender Systems","summary":" Recommender systems play an essential role in the choices people make in\ndomains such as entertainment, shopping, food, news, employment, and education.\nThe machine learning models underlying these recommender systems are often\nenormously large and black-box in nature for users, content providers, and\nsystem developers alike. It is often crucial for all stakeholders to understand\nthe model's rationale behind making certain predictions and recommendations.\nThis is especially true for the content providers whose livelihoods depend on\nthe recommender system. Drawing motivation from the practitioners' need, in\nthis work, we propose a recourse framework for recommender systems, targeted\ntowards the content providers. Algorithmic recourse in the recommendation\nsetting is a set of actions that, if executed, would modify the recommendations\n(or ranking) of an item in the desired manner. A recourse suggests actions of\nthe form: \"if a feature changes X to Y, then the ranking of that item for a set\nof users will change to Z.\" Furthermore, we demonstrate that RecRec is highly\neffective in generating valid, sparse, and actionable recourses through an\nempirical evaluation of recommender systems trained on three real-world\ndatasets. 
To the best of our knowledge, this work is the first to conceptualize\nand empirically test a generalized framework for generating recourses for\nrecommender systems.\n","authors":["Sahil Verma","Ashudeep Singh","Varich Boonsanong","John P. Dickerson","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14916v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2206.08464v2","updated":"2023-08-28T22:09:07Z","published":"2022-06-16T22:03:35Z","title":"PRANC: Pseudo RAndom Networks for Compacting deep models","summary":" We demonstrate that a deep model can be reparametrized as a linear\ncombination of several randomly initialized and frozen deep models in the\nweight space. During training, we seek local minima that reside within the\nsubspace spanned by these random models (i.e., `basis' networks). Our\nframework, PRANC, enables significant compaction of a deep model. The model can\nbe reconstructed using a single scalar `seed,' employed to generate the\npseudo-random `basis' networks, together with the learned linear mixture\ncoefficients.\n In practical applications, PRANC addresses the challenge of efficiently\nstoring and communicating deep models, a common bottleneck in several\nscenarios, including multi-agent learning, continual learners, federated\nsystems, and edge devices, among others. In this study, we employ PRANC to\ncondense image classification models and compress images by compacting their\nassociated implicit neural networks. PRANC outperforms baselines by a large\nmargin on image classification when compressing a deep model almost $100$\ntimes. Moreover, we show that PRANC enables memory-efficient inference by\ngenerating layer-wise weights on the fly. The source code of PRANC is here:\n\\url{https://github.com/UCDvision/PRANC}\n","authors":["Parsa Nooralinejad","Ali Abbasi","Soroush Abbasi Koohpayegani","Kossar Pourahmadi Meibodi","Rana Muhammad Shahroz Khan","Soheil Kolouri","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2206.08464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00270v2","updated":"2023-08-28T21:54:46Z","published":"2022-12-31T19:08:49Z","title":"Discovery and Exploitation of Generalized Network Effects","summary":" Given a large graph with few node labels, how can we (a) identify whether\nthere are generalized network-effects (GNE) in the graph or not, (b) estimate\nGNE to explain the interrelations among node classes, and (c) exploit GNE to\nimprove downstream tasks such as predicting the unknown labels accurately and\nefficiently? The knowledge of GNE is valuable for various tasks like node\nclassification and targeted advertising. However, identifying and understanding\nGNE such as homophily, heterophily or their combination is challenging in\nreal-world graphs due to limited availability of node labels and noisy edges.\nWe propose NetEffect, a graph mining approach to address the above issues,\nenjoying the following properties: (i) Principled: a statistical test to\ndetermine the presence of GNE in a graph with few node labels; (ii) General and\nExplainable: a closed-form solution to estimate the specific type of GNE\nobserved; and (iii) Accurate and Scalable: the integration of GNE for accurate\nand fast node classification. Applied to public, real-world graphs, NetEffect\ndiscovers the unexpected absence of GNE in numerous graphs, which were\npreviously thought to exhibit heterophily. Further, we show that incorporating\nGNE is effective for node classification.
On a large real-world graph with 1.6M nodes\nand 22.3M edges, NetEffect achieves over 7 times speedup (14 minutes vs. 2\nhours) compared to most competitors.\n","authors":["Meng-Chieh Lee","Shubhranshu Shekhar","Jaemin Yoo","Christos Faloutsos"],"pdf_url":"https://arxiv.org/pdf/2301.00270v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2304.12534v2","updated":"2023-08-28T21:30:20Z","published":"2023-04-25T03:00:18Z","title":"Mobilizing Personalized Federated Learning in Infrastructure-Less and\n Heterogeneous Environments via Random Walk Stochastic ADMM","summary":" This paper explores the challenges of implementing Federated Learning (FL) in\npractical scenarios featuring isolated nodes with data heterogeneity, which can\nonly be connected to the server through wireless links in an\ninfrastructure-less environment. To overcome these challenges, we propose a\nnovel mobilizing personalized FL approach, which aims to facilitate mobility\nand resilience. Specifically, we develop a novel optimization algorithm called\nRandom Walk Stochastic Alternating Direction Method of Multipliers (RWSADMM).\nRWSADMM capitalizes on the server's random movement toward clients and\nformulates local proximity among their adjacent clients based on hard\ninequality constraints rather than requiring consensus updates or introducing\nbias via regularization methods. To mitigate the computational burden on the\nclients, an efficient stochastic solver of the approximated optimization\nproblem is designed in RWSADMM, which provably converges to the stationary\npoint almost surely in expectation. Our theoretical and empirical results\ndemonstrate the provable fast convergence and substantial accuracy improvements\nachieved by RWSADMM compared to baseline methods, along with its benefits of\nreduced communication costs and enhanced scalability.\n","authors":["Ziba Parsons","Fei Dou","Houyi Du","Zheng Song","Jin Lu"],"pdf_url":"https://arxiv.org/pdf/2304.12534v2.pdf","comment":"28 pages, 7 figures, 3 tables, 1 algorithm. Proof details are\n provided in the main body of the paper"},{"id":"http://arxiv.org/abs/2308.14909v1","updated":"2023-08-28T21:25:05Z","published":"2023-08-28T21:25:05Z","title":"Pruning Self-Attention for Zero-Shot Multi-Speaker Text-to-Speech","summary":" For personalized speech generation, a neural text-to-speech (TTS) model must\nbe successfully implemented with limited data from a target speaker. To this\nend, the baseline TTS model needs to be amply generalized to out-of-domain data\n(i.e., target speaker's speech). However, approaches to address this\nout-of-domain generalization problem in TTS have yet to be thoroughly studied.\nIn this work, we propose an effective pruning method for a transformer known as\nsparse attention, to improve the TTS model's generalization abilities. In\nparticular, we prune off redundant connections from self-attention layers whose\nattention weights are below the threshold. To flexibly determine the pruning\nstrength for searching optimal degree of generalization, we also propose a new\ndifferentiable pruning method that allows the model to automatically learn the\nthresholds. 
Evaluations on zero-shot multi-speaker TTS verify the effectiveness\nof our method in terms of voice quality and speaker similarity.\n","authors":["Hyungchan Yoon","Changhwan Kim","Eunwoo Song","Hyun-Wook Yoon","Hong-Goo Kang"],"pdf_url":"https://arxiv.org/pdf/2308.14909v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.14906v1","updated":"2023-08-28T21:17:12Z","published":"2023-08-28T21:17:12Z","title":"BayOTIDE: Bayesian Online Multivariate Time series Imputation with\n functional decomposition","summary":" In real-world scenarios like traffic and energy, massive time-series data\nwith missing values and noise are widely observed, even sampled irregularly.\nWhile many imputation methods have been proposed, most of them work with a\nlocal horizon, which means models are trained by splitting the long sequence\ninto batches of fixed-size patches. This local horizon can make models ignore\nglobal trends or periodic patterns. More importantly, almost all methods assume\nthe observations are sampled at regular time stamps, and fail to handle complex\nirregularly sampled time series arising from different applications. Thirdly,\nmost existing methods are learned in an offline manner. Thus, they are not\nsuitable for many applications with fast-arriving streaming data. To overcome\nthese limitations, we propose BayOTIDE: Bayesian Online Multivariate Time\nseries Imputation with functional decomposition. We treat the multivariate time\nseries as the weighted combination of groups of low-rank temporal factors with\ndifferent patterns. We apply a group of Gaussian Processes (GPs) with different\nkernels as functional priors to fit the factors. For computational efficiency,\nwe further convert the GPs into a state-space prior by constructing an\nequivalent stochastic differential equation (SDE), and developing a scalable\nalgorithm for online inference. The proposed method can not only handle\nimputation over arbitrary time stamps, but also offer uncertainty\nquantification and interpretability for the downstream application. We evaluate\nour method on both synthetic and real-world datasets.\n","authors":["Shikai Fang","Qingsong Wen","Shandian Zhe","Liang Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06826v2","updated":"2023-08-28T21:14:35Z","published":"2023-06-12T02:26:00Z","title":"When Do Annotator Demographics Matter? Measuring the Influence of\n Annotator Demographics with the POPQUORN Dataset","summary":" Annotators are not fungible. Their demographics, life experiences, and\nbackgrounds all contribute to how they label data. However, NLP has only\nrecently considered how annotator identity might influence their decisions.\nHere, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering,\nOffensiveness, text Rewriting, and politeness rating with demographic Nuance).\nPOPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a\nsample representative of the US population with regard to sex, age, and race.\nThrough a series of analyses, we show that annotators' background plays a\nsignificant role in their judgments. Further, our work shows that backgrounds\nnot previously considered in NLP (e.g., education) are meaningful and should\nbe considered. Our study suggests that understanding the background of\nannotators and collecting labels from a demographically balanced pool of crowd\nworkers is important to reduce the bias of datasets.
The dataset, annotator\nbackground, and annotation interface are available at\nhttps://github.com/Jiaxin-Pei/potato-prolific-dataset .\n","authors":["Jiaxin Pei","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2306.06826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14904v1","updated":"2023-08-28T21:13:04Z","published":"2023-08-28T21:13:04Z","title":"Maturity-Aware Active Learning for Semantic Segmentation with\n Hierarchically-Adaptive Sample Assessment","summary":" Active Learning (AL) for semantic segmentation is challenging due to heavy\nclass imbalance and different ways of defining \"sample\" (pixels, areas, etc.),\nleaving the interpretation of the data distribution ambiguous. We propose\n\"Maturity-Aware Distribution Breakdown-based Active Learning'' (MADBAL), an AL\nmethod that benefits from a hierarchical approach to define a multiview data\ndistribution, which takes into account the different \"sample\" definitions\njointly, hence able to select the most impactful segmentation pixels with\ncomprehensive understanding. MADBAL also features a novel uncertainty\nformulation, where AL supporting modules are included to sense the features'\nmaturity whose weighted influence continuously contributes to the uncertainty\ndetection. In this way, MADBAL makes significant performance leaps even in the\nearly AL stage, hence reducing the training burden significantly. It\noutperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as\nverified in our extensive experiments.\n","authors":["Amirsaeed Yazdani","Xuelu Li","Vishal Monga"],"pdf_url":"https://arxiv.org/pdf/2308.14904v1.pdf","comment":"Accepted to the 34th British Machine Vision Conference (BMVC 2023)"},{"id":"http://arxiv.org/abs/2002.08907v3","updated":"2023-08-28T21:10:38Z","published":"2020-02-20T17:52:18Z","title":"Second-order Conditional Gradient Sliding","summary":" Constrained second-order convex optimization algorithms are the method of\nchoice when a high accuracy solution to a problem is needed, due to their local\nquadratic convergence. These algorithms require the solution of a constrained\nquadratic subproblem at every iteration. We present the \\emph{Second-Order\nConditional Gradient Sliding} (SOCGS) algorithm, which uses a projection-free\nalgorithm to solve the constrained quadratic subproblems inexactly. When the\nfeasible region is a polytope the algorithm converges quadratically in primal\ngap after a finite number of linearly convergent iterations. Once in the\nquadratic regime the SOCGS algorithm requires $\\mathcal{O}(\\log(\\log\n1/\\varepsilon))$ first-order and Hessian oracle calls and $\\mathcal{O}(\\log\n(1/\\varepsilon) \\log(\\log1/\\varepsilon))$ linear minimization oracle calls to\nachieve an $\\varepsilon$-optimal solution. This algorithm is useful when the\nfeasible region can only be accessed efficiently through a linear optimization\noracle, and computing first-order information of the function, although\npossible, is costly.\n","authors":["Alejandro Carderera","Sebastian Pokutta"],"pdf_url":"https://arxiv.org/pdf/2002.08907v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14902v1","updated":"2023-08-28T21:08:06Z","published":"2023-08-28T21:08:06Z","title":"Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in\n Recommendation Networks","summary":" Recommendation models are vital in delivering personalized user experiences\nby leveraging the correlation between multiple input features. 
However, deep\nlearning-based recommendation models often face challenges due to evolving user\nbehaviour and item features, leading to covariate shifts. Effective\ncross-feature learning is crucial for handling data distribution drift and\nadapting to changing user behaviour. Traditional feature interaction techniques\nhave limitations in achieving optimal performance in this context.\n This work introduces Ad-Rec, an advanced network that leverages feature\ninteraction techniques to address covariate shifts. This helps eliminate\nirrelevant interactions in recommendation tasks. Ad-Rec leverages masked\ntransformers to enable the learning of higher-order cross-features while\nmitigating the impact of data distribution drift. Our approach improves model\nquality, accelerates convergence, and reduces training time, as measured by the\nArea Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its\nability to achieve superior model quality through comprehensive ablation\nstudies.\n","authors":["Muhammad Adnan","Yassaman Ebrahimzadeh Maboud","Divya Mahajan","Prashant J. Nair"],"pdf_url":"https://arxiv.org/pdf/2308.14902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14897v1","updated":"2023-08-28T20:46:07Z","published":"2023-08-28T20:46:07Z","title":"Statistically Efficient Variance Reduction with Double Policy Estimation\n for Off-Policy Evaluation in Sequence-Modeled Reinforcement Learning","summary":" Offline reinforcement learning aims to utilize datasets of previously\ngathered environment-action interaction records to learn a policy without\naccess to the real environment. Recent work has shown that offline\nreinforcement learning can be formulated as a sequence modeling problem and\nsolved via supervised learning with approaches such as decision transformer.\nWhile these sequence-based methods achieve competitive results over\nreturn-to-go methods, especially on tasks that require longer episodes or have\nscarce rewards, importance sampling is not considered to correct the policy\nbias when dealing with off-policy data, mainly due to the absence of behavior\npolicy and the use of deterministic evaluation policies. To this end, we\npropose DPE: an RL algorithm that blends offline sequence modeling and offline\nreinforcement learning with Double Policy Estimation (DPE) in a unified\nframework with statistically proven properties on variance reduction. We\nvalidate our method in multiple tasks of OpenAI Gym with D4RL benchmarks. Our\nmethod brings performance improvements to selected methods, outperforming SOTA\nbaselines in several tasks, demonstrating the advantages of enabling double\npolicy estimation for sequence-modeled reinforcement learning.\n","authors":["Hanhan Zhou","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2308.14897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14895v1","updated":"2023-08-28T20:32:22Z","published":"2023-08-28T20:32:22Z","title":"Conformal Meta-learners for Predictive Inference of Individual Treatment\n Effects","summary":" We investigate the problem of machine learning-based (ML) predictive\ninference on individual treatment effects (ITEs). Previous work has focused\nprimarily on developing ML-based meta-learners that can provide point estimates\nof the conditional average treatment effect (CATE); these are model-agnostic\napproaches for combining intermediate nuisance estimates to produce estimates\nof CATE.
In this paper, we develop conformal meta-learners, a general framework\nfor issuing predictive intervals for ITEs by applying the standard conformal\nprediction (CP) procedure on top of CATE meta-learners. We focus on a broad\nclass of meta-learners based on two-stage pseudo-outcome regression and develop\na stochastic ordering framework to study their validity. We show that inference\nwith conformal meta-learners is marginally valid if their (pseudo outcome)\nconformity scores stochastically dominate oracle conformity scores evaluated on\nthe unobserved ITEs. Additionally, we prove that commonly used CATE\nmeta-learners, such as the doubly-robust learner, satisfy a model- and\ndistribution-free stochastic (or convex) dominance condition, making their\nconformal inferences valid for practically-relevant levels of target coverage.\nWhereas existing procedures conduct inference on nuisance parameters (i.e.,\npotential outcomes) via weighted CP, conformal meta-learners enable direct\ninference on the target parameter (ITE). Numerical experiments show that\nconformal meta-learners provide valid intervals with competitive efficiency\nwhile retaining the favorable point estimation properties of CATE\nmeta-learners.\n","authors":["Ahmed Alaa","Zaid Ahmad","Mark van der Laan"],"pdf_url":"https://arxiv.org/pdf/2308.14895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14893v1","updated":"2023-08-28T20:30:10Z","published":"2023-08-28T20:30:10Z","title":"When hard negative sampling meets supervised contrastive learning","summary":" State-of-the-art image models predominantly follow a two-stage strategy:\npre-training on large datasets and fine-tuning with cross-entropy loss. Many\nstudies have shown that using cross-entropy can result in sub-optimal\ngeneralisation and stability. While the supervised contrastive loss addresses\nsome limitations of cross-entropy loss by focusing on intra-class similarities\nand inter-class differences, it neglects the importance of hard negative\nmining. We propose that models benefit from weighting negative samples based on\ntheir dissimilarity to positive counterparts. In this paper, we introduce a new\nsupervised contrastive learning objective, SCHaNe, which incorporates hard\nnegative sampling during the fine-tuning phase. Without requiring specialized\narchitectures, additional data, or extra computational resources, experimental\nresults indicate that SCHaNe outperforms the strong baseline BEiT-3 in Top-1\naccuracy across various benchmarks, with significant gains of up to $3.32\\%$ in\nfew-shot learning settings and $3.41\\%$ in full dataset fine-tuning.\nImportantly, our proposed objective sets a new state-of-the-art for base models\non ImageNet-1k, achieving an 86.14\\% accuracy.
Furthermore, we demonstrate that\nthe proposed objective yields better embeddings and explains the improved\neffectiveness observed in our experiments.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa","Zaiqiao Meng"],"pdf_url":"https://arxiv.org/pdf/2308.14893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15786v3","updated":"2023-08-28T20:29:41Z","published":"2023-05-25T07:01:02Z","title":"Theoretical Guarantees of Learning Ensembling Strategies with\n Applications to Time Series Forecasting","summary":" Ensembling is among the most popular tools in machine learning (ML) due to\nits effectiveness in minimizing variance and thus improving generalization.\nMost ensembling methods for black-box base learners fall under the umbrella of\n\"stacked generalization,\" namely training an ML algorithm that takes the\ninferences from the base learners as input. While stacking has been widely\napplied in practice, its theoretical properties are poorly understood. In this\npaper, we prove a novel result, showing that choosing the best stacked\ngeneralization from a (finite or finite-dimensional) family of stacked\ngeneralizations based on cross-validated performance does not perform \"much\nworse\" than the oracle best. Our result strengthens and significantly extends\nthe results in Van der Laan et al. (2007). Inspired by the theoretical\nanalysis, we further propose a particular family of stacked generalizations in\nthe context of probabilistic forecasting, each one with a different sensitivity\nfor how much the ensemble weights are allowed to vary across items, timestamps\nin the forecast horizon, and quantiles. Experimental results demonstrate the\nperformance gain of the proposed method.\n","authors":["Hilaf Hasson","Danielle C. Maddix","Yuyang Wang","Gaurav Gupta","Youngsuk Park"],"pdf_url":"https://arxiv.org/pdf/2305.15786v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2306.04930v2","updated":"2023-08-28T20:21:59Z","published":"2023-06-08T04:24:24Z","title":"When to Show a Suggestion? Integrating Human Feedback in AI-Assisted\n Programming","summary":" AI-powered code-recommendation systems, such as Copilot and CodeWhisperer,\nprovide code suggestions inside a programmer's environment (e.g., an IDE) with\nthe aim to improve their productivity. Since, in these scenarios, programmers\naccept and reject suggestions, ideally, such a system should use this feedback\nin furtherance of this goal. In this work, we leverage prior data of\nprogrammers interacting with GitHub Copilot, a system used by millions of\nprogrammers, to develop interventions that can save programmer time. We propose\na utility theory framework, which models this interaction with programmers and\ndecides which suggestions to display. Our framework, Conditional suggestion\nDisplay from Human Feedback (CDHF), relies on a cascade of models that predict\nsuggestion acceptance to selectively hide suggestions, reducing both latency\nand programmer verification time. Using data from 535 programmers, we perform a\nretrospective evaluation of CDHF and show that we can avoid displaying a\nsignificant fraction of suggestions that would have been rejected, doing so\nwithout total knowledge of the suggestions themselves.
We further demonstrate\nthe importance of incorporating the programmer's latent unobserved state in\ndeciding when to display suggestions through ablations on user study data.\nFinally, we showcase that using suggestion acceptance as a reward signal to\nknow which suggestions to display leads to reduced-quality suggestions,\nindicating an unexpected pitfall.\n","authors":["Hussein Mozannar","Gagan Bansal","Adam Fourney","Eric Horvitz"],"pdf_url":"https://arxiv.org/pdf/2306.04930v2.pdf","comment":"Previous version of these results can be found in arXiv:2210.14306"},{"id":"http://arxiv.org/abs/2305.19370v3","updated":"2023-08-28T20:13:33Z","published":"2023-05-30T19:25:51Z","title":"Blockwise Parallel Transformer for Large Context Models","summary":" Transformers have emerged as the cornerstone of state-of-the-art natural\nlanguage processing models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands posed by the\nself-attention mechanism and the large feedforward network in Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving multiple long sequences or long-term dependencies. We present a\ndistinct approach, Blockwise Parallel Transformer (BPT), that leverages\nblockwise computation of self-attention and feedforward network fusion to\nminimize memory costs. By processing longer input sequences while maintaining\nmemory efficiency, BPT enables training sequences 32 times longer than vanilla\nTransformers and up to 4 times longer than previous memory-efficient methods.\nExtensive experiments on language modeling and reinforcement learning tasks\ndemonstrate the effectiveness of BPT in reducing memory requirements and\nimproving performance.\n","authors":["Hao Liu","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2305.19370v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.14524v1","updated":"2023-08-28T12:19:46Z","published":"2023-08-28T12:19:46Z","title":"Towards enabling reliable immersive teleoperation through Digital Twin:\n A UAV command and control use case","summary":" This paper addresses the challenging problem of enabling reliable immersive\nteleoperation in scenarios where an Unmanned Aerial Vehicle (UAV) is remotely\ncontrolled by an operator via a cellular network. Such scenarios can be quite\ncritical, particularly when the UAV lacks advanced equipment (e.g., Lidar-based\nauto stop) or when the network is subject to some performance constraints\n(e.g., delay). To tackle these challenges, we propose a novel architecture\nleveraging Digital Twin (DT) technology to create a virtual representation of\nthe physical environment. This virtual environment accurately mirrors the\nphysical world, accounting for 3D surroundings, weather constraints, and\nnetwork limitations. To enhance teleoperation, the UAV in the virtual\nenvironment is equipped with advanced features that may be absent in the real\nUAV. Furthermore, the proposed architecture introduces an intelligent logic\nthat utilizes information from both virtual and physical environments to\napprove, deny, or correct actions initiated by the UAV operator. This\nanticipatory approach helps to mitigate potential risks.
Through a series of\nfield trials, we demonstrate the effectiveness of the proposed architecture in\nsignificantly improving the reliability of UAV teleoperation.\n","authors":["Nassim Sehad","Xinyi Tu","Akash Rajasekaran","Hamed Hellaoui","Riku Jäntti","Mérouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2308.14524v1.pdf","comment":"Accepted by IEEE Globecom 2023"},{"id":"http://arxiv.org/abs/2308.14480v1","updated":"2023-08-28T10:40:16Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14395v1","updated":"2023-08-28T08:20:30Z","published":"2023-08-28T08:20:30Z","title":"UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for\n Temporal Forgery Localization","summary":" The emergence of artificial intelligence-generated content (AIGC) has raised\nconcerns about the authenticity of multimedia content in various fields.\nHowever, existing research for forgery content detection has focused mainly on\nbinary classification tasks of complete videos, which has limited applicability\nin industrial settings. To address this gap, we propose UMMAFormer, a novel\nuniversal transformer framework for temporal forgery localization (TFL) that\npredicts forgery segments with multimodal adaptation. Our approach introduces a\nTemporal Feature Abnormal Attention (TFAA) module based on temporal feature\nreconstruction to enhance the detection of temporal differences. We also design\na Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the\nFeature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the\nproposed method, we contribute a novel Temporal Video Inpainting Localization\n(TVIL) dataset specifically tailored for video inpainting scenes. 
Our\nexperiments show that our approach achieves state-of-the-art performance on\nbenchmark datasets, including Lav-DF, TVIL, and Psynd, significantly\noutperforming previous methods. The code and data are available at\nhttps://github.com/ymhzyj/UMMAFormer/.\n","authors":["Rui Zhang","Hongxia Wang","Mingshan Du","Hanqing Liu","Yang Zhou","Qiang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.14395v1.pdf","comment":"11 pages, 8 figures, 66 references. This paper has been accepted for\n ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.14316v1","updated":"2023-08-28T05:38:43Z","published":"2023-08-28T05:38:43Z","title":"UniPT: Universal Parallel Tuning for Transfer Learning with Efficient\n Parameter and Memory","summary":" Fine-tuning pre-trained models has emerged as a powerful technique in\nnumerous domains, owing to its ability to leverage enormous pre-existing\nknowledge and achieve remarkable performance on downstream tasks. However,\nupdating the parameters of entire networks is computationally intensive.\nAlthough state-of-the-art parameter-efficient transfer learning (PETL) methods\nsignificantly reduce the trainable parameters and storage demand, almost all of\nthem still need to back-propagate the gradients through large pre-trained\nnetworks. This memory-intensive characteristic severely limits the\napplicability of PETL methods in real-world scenarios. To this end, we propose\na new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT).\nSpecifically, we facilitate the transfer process via a lightweight learnable\nparallel network, which consists of two modules: 1) A parallel interaction\nmodule that decouples the inherently sequential connections and processes the\nintermediate activations of the pre-trained network detachedly. 2) A confidence\naggregation module that learns optimal strategies adaptively for integrating\ncross-layer features. We evaluate UniPT with different backbones (e.g.,\nVSE$\\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging\nvision-and-language tasks (i.e., image-text retrieval, video-text retrieval,\nvisual question answering, compositional question answering, and visual\ngrounding). Extensive ablations on ten datasets have validated that our UniPT\ncan not only dramatically reduce memory consumption and outperform the best\nmemory-efficient competitor, but also achieve higher performance than existing\nPETL methods in a low-memory scenario on different architectures. Our code is\npublicly available at: https://github.com/Paranioar/UniPT.\n","authors":["Haiwen Diao","Bo Wan","Ying Zhang","Xu Jia","Huchuan Lu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14316v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.06725v2","updated":"2023-08-28T04:27:35Z","published":"2023-08-13T09:05:56Z","title":"CLE Diffusion: Controllable Light Enhancement Diffusion Model","summary":" Low light enhancement has gained increasing importance with the rapid\ndevelopment of visual creation and editing. However, most existing enhancement\nalgorithms are designed to homogeneously increase the brightness of images to a\npre-defined extent, limiting the user experience. To address this issue, we\npropose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a\nnovel diffusion framework to provide users with rich controllability. Built\nwith a conditional diffusion model, we introduce an illumination embedding to\nlet users control their desired brightness level.
Additionally, we incorporate\nthe Segment-Anything Model (SAM) to enable user-friendly region\ncontrollability, where users can click on objects to specify the regions they\nwish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves\ncompetitive performance regarding quantitative metrics, qualitative results,\nand versatile controllability. Project page:\nhttps://yuyangyin.github.io/CLEDiffusion/\n","authors":["Yuyang Yin","Dejia Xu","Chuangchuang Tan","Ping Liu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2308.06725v2.pdf","comment":"Accepted In Proceedings of the 31st ACM International Conference on\n Multimedia (MM' 23)"},{"id":"http://arxiv.org/abs/2308.14274v1","updated":"2023-08-28T03:13:27Z","published":"2023-08-28T03:13:27Z","title":"Parameter-Efficient Transfer Learning for Audio-Visual-Language Tasks","summary":" The pretrain-then-finetune paradigm has been widely used in various unimodal\nand multimodal tasks. However, finetuning all the parameters of a pre-trained\nmodel becomes prohibitive as the model size grows exponentially. To address\nthis issue, the adapter mechanism that freezes the pre-trained model and only\nfinetunes a few extra parameters is introduced and delivers promising results.\nMost studies on adapter architectures are dedicated to unimodal or bimodal\ntasks, while the adapter architectures for trimodal tasks have not been\ninvestigated yet. This paper introduces a novel Long Short-Term Trimodal\nAdapter (LSTTA) approach for video understanding tasks involving audio, visual,\nand language modalities. Based on the pre-trained models from the three modalities,\nthe designed adapter module is inserted between the sequential blocks to model\nthe dense interactions across the three modalities. Specifically, LSTTA\nconsists of two types of complementary adapter modules, namely the long-term\nsemantic filtering module and the short-term semantic interaction module. The\nlong-term semantic filtering module aims to characterize the temporal importance of\nthe video frames, and the short-term semantic interaction module models local\ninteractions within short periods. Compared to previous state-of-the-art\ntrimodal learning methods pre-trained on a large-scale trimodal corpus, LSTTA\nis more flexible and can inherit any powerful unimodal or bimodal models.\nExperimental results on four typical trimodal learning tasks show the\neffectiveness of LSTTA over existing state-of-the-art methods.\n","authors":["Hongye Liu","Xianhai Xie","Yang Gao","Size Li","Zhou YU"],"pdf_url":"https://arxiv.org/pdf/2308.14274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14263v1","updated":"2023-08-28T02:38:17Z","published":"2023-08-28T02:38:17Z","title":"Cross-Modal Retrieval: A Systematic Review of Methods and Future\n Directions","summary":" With the exponential surge in diverse multi-modal data, traditional uni-modal\nretrieval methods struggle to meet the needs of users demanding access to data\nfrom various modalities. To address this, cross-modal retrieval has emerged,\nenabling interaction across modalities, facilitating semantic matching, and\nleveraging complementarity and consistency between different modal data.\nAlthough prior literature undertook a review of the cross-modal retrieval\nfield, it exhibits numerous deficiencies pertaining to timeliness, taxonomy,\nand comprehensiveness. 
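The adapter mechanism the LSTTA abstract above builds on is the standard bottleneck design: the pre-trained blocks stay frozen and only small residual down-/up-projections inserted between them are trained. A generic sketch with illustrative dimensions; LSTTA's actual long-term and short-term modules are more elaborate:

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Standard bottleneck adapter: down-project, nonlinearity, up-project, residual."""
    def __init__(self, dim=768, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        nn.init.zeros_(self.up.weight)   # zero init: adapter starts as the identity
        nn.init.zeros_(self.up.bias)

    def forward(self, x):
        return x + self.up(torch.relu(self.down(x)))

# Only adapter parameters are trained; the surrounding blocks stay frozen.
x = torch.randn(2, 16, 768)
print(Adapter()(x).shape)  # torch.Size([2, 16, 768])
```

Zero-initializing the up-projection is a common stabilizing choice: training begins from the unmodified pre-trained behavior and the adapter gradually learns a residual correction.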
This paper conducts a comprehensive review of\ncross-modal retrieval's evolution, spanning from shallow statistical analysis\ntechniques to vision-language pre-training models. Commencing with a\ncomprehensive taxonomy grounded in machine learning paradigms, mechanisms, and\nmodels, the paper then delves deeply into the principles and architectures\nunderpinning existing cross-modal retrieval methods. Furthermore, it offers an\noverview of widely used benchmarks, metrics, and performances. Lastly, the\npaper probes the prospects and challenges that confront contemporary\ncross-modal retrieval, while engaging in a discourse on potential directions\nfor further progress in the field. To facilitate the research on cross-modal\nretrieval, we develop an open-source code repository at\nhttps://github.com/BMC-SDNU/Cross-Modal-Retrieval.\n","authors":["Lei Zhu","Tianshi Wang","Fengling Li","Jingjing Li","Zheng Zhang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14263v1.pdf","comment":null}]},"2023-08-27T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.10335v2","updated":"2023-08-27T23:06:07Z","published":"2023-08-20T18:36:28Z","title":"A Study on Robustness and Reliability of Large Language Model Code\n Generation","summary":" Recently, large language models (LLMs) have shown extraordinary ability\nin understanding natural language and generating programming code. It has been\na common practice of software engineers to consult LLMs when encountering\ncoding questions. Although efforts have been made to avoid syntax errors and\nalign the code with the intended semantics, the reliability and robustness of\nthe code generation from LLMs have not yet been thoroughly studied. Executable\ncode is not equivalent to reliable and robust code, especially\nin the context of real-world software development. The misuse of APIs in the\ngenerated code could lead to severe problems, such as resource leaks and program\ncrashes. To make things worse, the users of LLM code generation services are\noften the developers most vulnerable to this seemingly correct code\n-- novice developers who are not familiar with the APIs for which\nLLMs generate code. Therefore, they can hardly spot the misuse in\nthe code generated by LLMs, which further facilitates such incorrect code being\napplied in real-world software. Existing code evaluation benchmarks and datasets\nfocus on crafting small tasks such as programming questions in coding\ninterviews, which however deviate from the problems that developers would ask\nLLMs about for real-world coding help. To fill the missing piece, in this work, we\npropose a dataset RobustAPI for evaluating the reliability and robustness of\ncode generated by LLMs. We collect 1208 coding questions from StackOverflow on\n24 representative Java APIs. We summarize the common misuse patterns of these\nAPIs and evaluate them on current popular LLMs. 
The evaluation results show that\neven for GPT-4, 62% of the generated code contains API misuses, which would cause\nunexpected consequences if the code is introduced into real-world software.\n","authors":["Li Zhong","Zilong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14217v1","updated":"2023-08-27T22:35:27Z","published":"2023-08-27T22:35:27Z","title":"Generations of Knowledge Graphs: The Crazy Ideas and the Business Impact","summary":" Knowledge Graphs (KGs) have been used to support a wide range of\napplications, from web search to personal assistant. In this paper, we describe\nthree generations of knowledge graphs: entity-based KGs, which have been\nsupporting general search and question answering (e.g., at Google and Bing);\ntext-rich KGs, which have been supporting search and recommendations for\nproducts, bio-informatics, etc. (e.g., at Amazon and Alibaba); and the emerging\nintegration of KGs and LLMs, which we call dual neural KGs. We describe the\ncharacteristics of each generation of KGs, the crazy ideas behind the scenes in\nconstructing such KGs, and the techniques developed over time to enable\nindustry impact. In addition, we use KGs as examples to demonstrate a recipe to\nevolve research ideas from innovations to production practice, and then to the\nnext level of innovations, to advance both science and business.\n","authors":["Xin Luna Dong"],"pdf_url":"https://arxiv.org/pdf/2308.14217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14199v1","updated":"2023-08-27T20:24:33Z","published":"2023-08-27T20:24:33Z","title":"Symbolic and Language Agnostic Large Language Models","summary":" We argue that the relative success of large language models (LLMs) is not a\nreflection on the symbolic vs. subsymbolic debate but a reflection on employing\nan appropriate strategy of bottom-up reverse engineering of language at scale.\nHowever, due to the subsymbolic nature of these models, whatever knowledge these\nsystems acquire about language will always be buried in millions of\nmicrofeatures (weights), none of which is meaningful on its own. Moreover, and\ndue to their stochastic nature, these models will often fail in capturing\nvarious inferential aspects that are prevalent in natural language. What we\nsuggest here is employing the successful bottom-up strategy in a symbolic\nsetting, producing symbolic, language agnostic and ontologically grounded large\nlanguage models.\n","authors":["Walid S. Saba"],"pdf_url":"https://arxiv.org/pdf/2308.14199v1.pdf","comment":"4 pages - draft. arXiv admin note: substantial text overlap with\n arXiv:2306.00017"},{"id":"http://arxiv.org/abs/2308.14186v1","updated":"2023-08-27T19:22:12Z","published":"2023-08-27T19:22:12Z","title":"Empowering Cross-lingual Abilities of Instruction-tuned Large Language\n Models by Translation-following demonstrations","summary":" The language ability of Large Language Models (LLMs) is often unbalanced\ntowards English because of the imbalance in the distribution of the\npre-training data. This disparity carries over into further fine-tuning and\naffects the cross-lingual abilities of LLMs. In this paper, we propose to\nempower Instruction-tuned LLMs (It-LLMs) in languages other than English by\nbuilding semantic alignment between them. Hence, we propose CrossAlpaca, an\nIt-LLM with cross-lingual instruction-following and Translation-following\ndemonstrations to improve semantic alignment between languages. 
We validate our\napproach on the multilingual Question Answering (QA) benchmarks XQUAD and MLQA\nand adapted versions of MMLU and BBH. Our models, tested over six different\nlanguages, outperform the It-LLMs tuned on monolingual data. The final results\nshow that instruction tuning on non-English data is not enough and that\nsemantic alignment can be further improved by Translation-following\ndemonstrations.\n","authors":["Leonardo Ranaldi","Giulia Pucci","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2308.14186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14182v1","updated":"2023-08-27T19:03:12Z","published":"2023-08-27T19:03:12Z","title":"Generative AI for Business Strategy: Using Foundation Models to Create\n Business Strategy Tools","summary":" Generative models (foundation models) such as LLMs (large language models)\nare having a large impact on multiple fields. In this work, we propose the use\nof such models for business decision making. In particular, we combine\nunstructured textual data sources (e.g., news data) with multiple foundation\nmodels (namely, GPT4, transformer-based Named Entity Recognition (NER) models\nand Entailment-based Zero-shot Classifiers (ZSC)) to derive IT (information\ntechnology) artifacts in the form of a (sequence of) signed business networks.\nWe posit that such artifacts can inform business stakeholders about the state\nof the market and their own positioning as well as provide quantitative\ninsights into improving their future outlook.\n","authors":["Son The Nguyen","Theja Tulabandhula"],"pdf_url":"https://arxiv.org/pdf/2308.14182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14179v1","updated":"2023-08-27T18:46:47Z","published":"2023-08-27T18:46:47Z","title":"Towards Vision-Language Mechanistic Interpretability: A Causal Tracing\n Tool for BLIP","summary":" Mechanistic interpretability seeks to understand the neural mechanisms that\nenable specific behaviors in Large Language Models (LLMs) by leveraging\ncausality-based methods. While these approaches have identified neural circuits\nthat copy spans of text, capture factual knowledge, and more, they remain\nunusable for multimodal models since adapting these tools to the\nvision-language domain requires considerable architectural changes. In this\nwork, we adapt a unimodal causal tracing tool to BLIP to enable the study of\nthe neural mechanisms underlying image-conditioned text generation. We\ndemonstrate our approach on a visual question answering dataset, highlighting\nthe causal relevance of later layer representations for all tokens.\nFurthermore, we release our BLIP causal tracing tool as open source to enable\nfurther experimentation in vision-language mechanistic interpretability by the\ncommunity. Our code is available at\nhttps://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability.\n","authors":["Vedant Palit","Rohan Pandey","Aryaman Arora","Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2308.14179v1.pdf","comment":"Final version for 5th Workshop on Closing the Loop Between Vision and\n Language (CLVL) @ ICCV 2023. 
4 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14149v1","updated":"2023-08-27T16:14:19Z","published":"2023-08-27T16:14:19Z","title":"Examining User-Friendly and Open-Sourced Large GPT Models: A Survey on\n Language, Multimodal, and Scientific GPT Models","summary":" Generative pre-trained transformer (GPT) models have revolutionized the field\nof natural language processing (NLP) with remarkable performance in various\ntasks and also extend their power to multimodal domains. Despite their success,\nlarge GPT models like GPT-4 face inherent limitations such as considerable\nsize, high computational requirements, complex deployment processes, and closed\ndevelopment loops. These constraints restrict their widespread adoption and\nraise concerns regarding their responsible development and usage. The need for\nuser-friendly, relatively small, and open-sourced alternative GPT models arises\nfrom the desire to overcome these limitations while retaining high performance.\nIn this survey paper, we provide an examination of alternative open-sourced\nmodels of large GPTs, focusing on user-friendly and relatively small models\nthat facilitate easier deployment and accessibility. Through this extensive\nsurvey, we aim to equip researchers, practitioners, and enthusiasts with a\nthorough understanding of user-friendly and relatively small open-sourced\nmodels of large GPTs, their current state, challenges, and future research\ndirections, inspiring the development of more efficient, accessible, and\nversatile GPT models that cater to the broader scientific community and advance\nthe field of general artificial intelligence. The source contents are\ncontinuously updating in https://github.com/GPT-Alternatives/gpt_alternatives.\n","authors":["Kaiyuan Gao","Sunan He","Zhenyu He","Jiacheng Lin","QiZhi Pei","Jie Shao","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14132v1","updated":"2023-08-27T15:20:06Z","published":"2023-08-27T15:20:06Z","title":"Detecting Language Model Attacks with Perplexity","summary":" A novel hack involving Large Language Models (LLMs) has emerged, leveraging\nadversarial suffixes to trick models into generating perilous responses. This\nmethod has garnered considerable attention from reputable media outlets such as\nthe New York Times and Wired, thereby influencing public perception regarding\nthe security and safety of LLMs. In this study, we advocate the utilization of\nperplexity as one of the means to recognize such potential attacks. The\nunderlying concept behind these hacks revolves around appending an unusually\nconstructed string of text to a harmful query that would otherwise be blocked.\nThis maneuver confuses the protective mechanisms and tricks the model into\ngenerating a forbidden response. Such scenarios could result in providing\ndetailed instructions to a malicious user for constructing explosives or\norchestrating a bank heist. Our investigation demonstrates the feasibility of\nemploying perplexity, a prevalent natural language processing metric, to detect\nthese adversarial tactics before generating a forbidden response. By evaluating\nthe perplexity of queries with and without such adversarial suffixes using an\nopen-source LLM, we discovered that nearly 90 percent were above a perplexity\nof 1000. 
This contrast underscores the efficacy of perplexity for detecting\nthis type of exploit.\n","authors":["Gabriel Alon","Michael Kamfonas"],"pdf_url":"https://arxiv.org/pdf/2308.14132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14120v1","updated":"2023-08-27T14:28:38Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap\nand perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT CI without specific guidance. ChatGPT CI autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. We conclude that chatGPT CI offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14115v1","updated":"2023-08-27T14:14:28Z","published":"2023-08-27T14:14:28Z","title":"Situated Natural Language Explanations","summary":" Natural language is among the most accessible tools for explaining decisions\nto humans, and large pretrained language models (PLMs) have demonstrated\nimpressive abilities to generate coherent natural language explanations (NLE).\nThe existing NLE research perspectives do not take the audience into account.\nAn NLE can have high textual quality, but it might not accommodate audiences'\nneeds and preference. To address this limitation, we propose an alternative\nperspective, situated NLE, including a situated generation framework and a\nsituated evaluation framework. On the generation side, we propose simple prompt\nengineering methods that adapt the NLEs to situations. In human studies, the\nannotators preferred the situated NLEs. On the evaluation side, we set up\nautomated evaluation scores in lexical, semantic, and pragmatic categories. 
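The perplexity screen from the "Detecting Language Model Attacks with Perplexity" entry above reduces to a few lines with any open-source causal LM. GPT-2 and the cutoff of 1000 below are illustrative stand-ins; a deployment would calibrate both (the paper reports nearly 90 percent of suffixed queries exceeding a perplexity of 1000 with its chosen model):

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
lm = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def perplexity(text: str) -> float:
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = lm(ids, labels=ids).loss   # mean token cross-entropy
    return float(torch.exp(loss))

def looks_adversarial(query: str, threshold: float = 1000.0) -> bool:
    # Unnaturally constructed suffixes drive perplexity far above natural text.
    return perplexity(query) > threshold

print(looks_adversarial("Describe how rainbows form."))  # expected: False
```

The design rationale is that adversarial suffixes are optimized against the target model's safety training, not for linguistic naturalness, so a generic LM assigns them very low probability and hence very high perplexity.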
The\nscores can be used to select the most suitable prompts to generate NLEs.\nSituated NLE provides a perspective to conduct further research on automatic\nNLE generations.\n","authors":["Zining Zhu","Haoming Jiang","Jingfeng Yang","Sreyashi Nag","Chao Zhang","Jie Huang","Yifan Gao","Frank Rudzicz","Bing Yin"],"pdf_url":"https://arxiv.org/pdf/2308.14115v1.pdf","comment":"A previous version was presented in ACL 2023 NLRSE workshop"},{"id":"http://arxiv.org/abs/2308.14089v1","updated":"2023-08-27T12:24:39Z","published":"2023-08-27T12:24:39Z","title":"MedAlign: A Clinician-Generated Dataset for Instruction Following with\n Electronic Medical Records","summary":" The ability of large language models (LLMs) to follow natural language\ninstructions with human-level fluency suggests many opportunities in healthcare\nto reduce administrative burden and improve quality of care. However,\nevaluating LLMs on realistic text generation tasks for healthcare remains\nchallenging. Existing question answering datasets for electronic health record\n(EHR) data fail to capture the complexity of information needs and\ndocumentation burdens experienced by clinicians. To address these challenges,\nwe introduce MedAlign, a benchmark dataset of 983 natural language instructions\nfor EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes\nclinician-written reference responses for 303 instructions, and provides 276\nlongitudinal EHRs for grounding instruction-response pairs. We used MedAlign to\nevaluate 6 general domain LLMs, having clinicians rank the accuracy and quality\nof each LLM response. We found high error rates, ranging from 35% (GPT-4) to\n68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k\ncontext lengths for GPT-4. Finally, we report correlations between clinician\nrankings and automated natural language generation metrics as a way to rank\nLLMs without human review. We make MedAlign available under a research data use\nagreement to enable LLM evaluations on tasks aligned with clinician needs and\npreferences.\n","authors":["Scott L. Fleming","Alejandro Lozano","William J. Haberkorn","Jenelle A. Jindal","Eduardo P. Reis","Rahul Thapa","Louis Blankemeier","Julian Z. Genkins","Ethan Steinberg","Ashwin Nayak","Birju S. Patel","Chia-Chun Chiang","Alison Callahan","Zepeng Huo","Sergios Gatidis","Scott J. Adams","Oluseyi Fayanju","Shreya J. Shah","Thomas Savage","Ethan Goh","Akshay S. Chaudhari","Nima Aghaeepour","Christopher Sharp","Michael A. Pfeffer","Percy Liang","Jonathan H. Chen","Keith E. Morse","Emma P. Brunskill","Jason A. Fries","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14077v1","updated":"2023-08-27T11:51:27Z","published":"2023-08-27T11:51:27Z","title":"An Analysis of On-the-fly Determinization of Finite-state Automata","summary":" In this paper we establish an abstraction of on-the-fly determinization of\nfinite-state automata using transition monoids and demonstrate how it can be\napplied to bound the asymptotics. We present algebraic and combinatorial\nproperties that are sufficient for a polynomial state complexity of the\ndeterministic automaton constructed on-the-fly. A special case of our findings\nis that automata with many non-deterministic transitions almost always admit a\ndeterminization of polynomial complexity. 
Furthermore, we extend our ideas to\nweighted finite-state automata.\n","authors":["Ivan Baburin","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2308.14077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v3","updated":"2023-08-27T11:21:38Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v3.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.14034v1","updated":"2023-08-27T07:53:00Z","published":"2023-08-27T07:53:00Z","title":"Confucius: Iterative Tool Learning from Introspection Feedback by\n Easy-to-Difficult Curriculum","summary":" Augmenting large language models (LLMs) with external tools has emerged as a\npromising approach to extending the capability of LLMs. Although some works\nemploy open-source LLMs for the tool learning task, most of them are trained in\na controlled environment in which LLMs only learn to execute the human-provided\ntools. However, selecting proper tools from the large toolset is also a crucial\nability for the tool learning model to be applied in real-world applications.\nExisting methods usually directly employ self-instruction methods to train the\nmodel, which ignores differences in tool complexity. 
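The on-the-fly determinization analyzed in the finite-state automata entry above is, at its core, lazy subset construction: deterministic states are subsets of NFA states, created only when first reached, and the paper's complexity bounds concern how many such subsets can arise. A compact unweighted sketch (the transition-monoid machinery and the weighted extension are not reproduced here):

```python
from collections import deque

def determinize(states, alphabet, delta, start, accept):
    """Lazy subset construction. delta: dict (state, symbol) -> set of states."""
    start_set = frozenset([start])
    dstates = {start_set}
    dtrans, queue = {}, deque([start_set])
    while queue:                       # explore only subsets actually reached
        S = queue.popleft()
        for a in alphabet:
            T = frozenset(q for s in S for q in delta.get((s, a), ()))
            dtrans[(S, a)] = T
            if T not in dstates:
                dstates.add(T)
                queue.append(T)
    daccept = {S for S in dstates if S & accept}
    return dstates, dtrans, start_set, daccept

# Toy NFA over {a, b} accepting strings that end in "ab".
delta = {(0, "a"): {0, 1}, (0, "b"): {0}, (1, "b"): {2}}
dstates, _, _, _ = determinize({0, 1, 2}, "ab", delta, 0, {2})
print(len(dstates), "deterministic states")  # 3 reachable subsets
```

Only reachable subsets are ever materialized, which is why, as the abstract notes, many NFAs in practice admit determinization of polynomial rather than exponential state complexity.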
In this paper, we propose\nthe Confucius, a novel tool learning framework to train LLM to use complicated\ntools in real-world scenarios, which contains two main phases: (1) We first\npropose a multi-stage learning method to teach the LLM to use various tools\nfrom an easy-to-difficult curriculum; (2) thenceforth, we propose the Iterative\nSelf-instruct from Introspective Feedback (ISIF) to dynamically construct the\ndataset to improve the ability to use the complicated tool. Extensive\nexperiments conducted on both controlled and real-world settings demonstrate\nthe superiority of our tool learning framework in the real-world application\nscenarios compared to both tuning-free (e.g. ChatGPT, Claude) and tuning-based\nbaselines (e.g. GPT4Tools).\n","authors":["Shen Gao","Zhengliang Shi","Minghang Zhu","Bowen Fang","Xin Xin","Pengjie Ren","Zhumin Chen","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2308.14034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06624v2","updated":"2023-08-27T02:55:36Z","published":"2023-06-11T08:53:12Z","title":"RestGPT: Connecting Large Language Models with Real-World RESTful APIs","summary":" Tool-augmented large language models (LLMs) have achieved remarkable progress\nin tackling a broad range of tasks. However, existing methods are mainly\nrestricted to specifically designed tools and fail to fulfill complex\ninstructions, having great limitations when confronted with real-world\nscenarios. In this paper, we explore a more realistic scenario by connecting\nLLMs with RESTful APIs, which adhere to the widely adopted REST software\narchitectural style for web service development. To address the practical\nchallenges of tackling complex instructions, we propose RestGPT, which exploits\nthe power of LLMs and conducts a coarse-to-fine online planning mechanism to\nenhance the abilities of task decomposition and API selection. RestGPT also\ncontains an API executor tailored for calling RESTful APIs, which can\nmeticulously formulate parameters and parse API responses. To fully evaluate\nthe performance of RestGPT, we propose RestBench, a high-quality benchmark\nwhich consists of two real-world scenarios and human-annotated instructions\nwith gold solution paths. Experiments show that RestGPT is able to achieve\nimpressive results in complex tasks and has strong robustness, which paves a\nnew way towards AGI. RestGPT and RestBench is publicly available at\nhttps://restgpt.github.io/.\n","authors":["Yifan Song","Weimin Xiong","Dawei Zhu","Wenhao Wu","Han Qian","Mingbo Song","Hailiang Huang","Cheng Li","Ke Wang","Rong Yao","Ye Tian","Sujian Li"],"pdf_url":"https://arxiv.org/pdf/2306.06624v2.pdf","comment":"Add RestBench to evaluate RestGPT"},{"id":"http://arxiv.org/abs/2209.07562v3","updated":"2023-08-27T02:42:16Z","published":"2022-09-15T19:01:21Z","title":"TwHIN-BERT: A Socially-Enriched Pre-trained Language Model for\n Multilingual Tweet Representations at Twitter","summary":" Pre-trained language models (PLMs) are fundamental for natural language\nprocessing applications. Most existing PLMs are not tailored to the noisy\nuser-generated text on social media, and the pre-training does not factor in\nthe valuable social engagement logs available in a social network. We present\nTwHIN-BERT, a multilingual language model productionized at Twitter, trained on\nin-domain data from the popular social network. 
TwHIN-BERT differs from prior\npre-trained language models as it is trained with not only text-based\nself-supervision, but also with a social objective based on the rich social\nengagements within a Twitter heterogeneous information network (TwHIN). Our\nmodel is trained on 7 billion tweets covering over 100 distinct languages,\nproviding a valuable representation to model short, noisy, user-generated text.\nWe evaluate our model on various multilingual social recommendation and\nsemantic understanding tasks and demonstrate significant metric improvement\nover established pre-trained language models. We open-source TwHIN-BERT and our\ncurated hashtag prediction and social engagement benchmark datasets to the\nresearch community.\n","authors":["Xinyang Zhang","Yury Malkov","Omar Florez","Serim Park","Brian McWilliams","Jiawei Han","Ahmed El-Kishky"],"pdf_url":"https://arxiv.org/pdf/2209.07562v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01488v3","updated":"2023-08-27T02:38:30Z","published":"2022-12-02T23:43:18Z","title":"Event knowledge in large language models: the gap between the impossible\n and the unlikely","summary":" Word co-occurrence patterns in language corpora contain a surprising amount\nof conceptual knowledge. Large language models (LLMs), trained to predict words\nin context, leverage these patterns to achieve impressive performance on\ndiverse semantic tasks requiring world knowledge. An important but understudied\nquestion about LLMs' semantic abilities is whether they acquire generalized\nknowledge of common events. Here, we test whether five pre-trained LLMs (from\n2018's BERT to 2023's MPT) assign higher likelihood to plausible descriptions\nof agent-patient interactions than to minimally different implausible versions\nof the same event. Using three curated sets of minimal sentence pairs (total\nn=1,215), we found that pre-trained LLMs possess substantial event knowledge,\noutperforming other distributional language models. In particular, they almost\nalways assign higher likelihood to possible vs. impossible events (The teacher\nbought the laptop vs. The laptop bought the teacher). However, LLMs show less\nconsistent preferences for likely vs. unlikely events (The nanny tutored the\nboy vs. The boy tutored the nanny). In follow-up analyses, we show that (i) LLM\nscores are driven by both plausibility and surface-level sentence features,\n(ii) LLM scores generalize well across syntactic variants (active vs. passive\nconstructions) but less well across semantic variants (synonymous sentences),\n(iii) some LLM errors mirror human judgment ambiguity, and (iv) sentence\nplausibility serves as an organizing dimension in internal LLM representations.\nOverall, our results show that important aspects of event knowledge naturally\nemerge from distributional linguistic patterns, but also highlight a gap\nbetween representations of possible/impossible and likely/unlikely events.\n","authors":["Carina Kauf","Anna A. Ivanova","Giulia Rambelli","Emmanuele Chersoni","Jingyuan Selena She","Zawad Chowdhury","Evelina Fedorenko","Alessandro Lenci"],"pdf_url":"https://arxiv.org/pdf/2212.01488v3.pdf","comment":"The two lead authors have contributed equally to this work"},{"id":"http://arxiv.org/abs/2302.03162v3","updated":"2023-08-27T00:13:09Z","published":"2023-02-06T23:42:03Z","title":"Protecting Language Generation Models via Invisible Watermarking","summary":" Language generation models have been an increasingly powerful enabler for\nmany applications. 
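The possible-vs-impossible comparison in the event-knowledge entry above can be reproduced for any causal LM by comparing sentence log-likelihoods on minimal pairs. GPT-2 below is an illustrative stand-in for the five models the paper evaluates:

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
lm = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def loglik(sentence: str) -> float:
    ids = tok(sentence, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = lm(ids, labels=ids).loss        # mean NLL per predicted token
    return -float(loss) * ids.shape[1]          # approximate total log-likelihood

plausible = "The teacher bought the laptop."
implausible = "The laptop bought the teacher."
print(loglik(plausible) > loglik(implausible))  # expected: True
```

Aggregating this comparison over many curated minimal pairs gives exactly the kind of accuracy statistic the study reports for possible vs. impossible and likely vs. unlikely events.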
Many such models offer free or affordable API access, which\nmakes them potentially vulnerable to model extraction attacks through\ndistillation. To protect intellectual property (IP) and ensure fair use of\nthese models, various techniques such as lexical watermarking and synonym\nreplacement have been proposed. However, these methods can be nullified by\nobvious countermeasures such as \"synonym randomization\". To address this issue,\nwe propose GINSEW, a novel method to protect text generation models from being\nstolen through distillation. The key idea of our method is to inject secret\nsignals into the probability vector of the decoding steps for each target\ntoken. We can then detect the secret message by probing a suspect model to tell\nif it is distilled from the protected one. Experimental results show that\nGINSEW can effectively identify instances of IP infringement with minimal\nimpact on the generation quality of protected APIs. Our method demonstrates an\nabsolute improvement of 19 to 29 points on mean average precision (mAP) in\ndetecting suspects compared to previous methods against watermark removal\nattacks.\n","authors":["Xuandong Zhao","Yu-Xiang Wang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2302.03162v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.14763v1","updated":"2023-08-27T07:35:30Z","published":"2023-08-27T07:35:30Z","title":"VoiceBank-2023: A Multi-Speaker Mandarin Speech Corpus for Constructing\n Personalized TTS Systems for the Speech Impaired","summary":" Services of personalized TTS systems for the Mandarin-speaking speech\nimpaired are rarely mentioned. Taiwan started the VoiceBanking project in 2020,\naiming to build a complete set of services to deliver personalized Mandarin TTS\nsystems to amyotrophic lateral sclerosis patients. This paper reports the\ncorpus design, corpus recording, data purging and correction for the corpus,\nand evaluations of the developed personalized TTS systems, for the VoiceBanking\nproject. The developed corpus is named the VoiceBank-2023 speech corpus\nafter its release year. The corpus contains 29.78 hours of utterances with\nprompts of short paragraphs and common phrases spoken by 111 native Mandarin\nspeakers. The corpus is labeled with information about gender, degree of speech\nimpairment, types of users, transcription, SNRs, and speaking rates. The\nVoiceBank-2023 is available by request for non-commercial use and welcomes all\nparties to join the VoiceBanking project to improve the services for the speech\nimpaired.\n","authors":["Jia-Jyu Su","Pang-Chen Liao","Yen-Ting Lin","Wu-Hao Li","Guan-Ting Liou","Cheng-Che Kao","Wei-Cheng Chen","Jen-Chieh Chiang","Wen-Yang Chang","Pin-Han Lin","Chen-Yu Chiang"],"pdf_url":"https://arxiv.org/pdf/2308.14763v1.pdf","comment":"submitted to 26th International Conference of the ORIENTAL-COCOSDA"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.14221v1","updated":"2023-08-27T22:45:24Z","published":"2023-08-27T22:45:24Z","title":"High-Resolution Document Shadow Removal via A Large-Scale Real-World\n Dataset and A Frequency-Aware Shadow Erasing Net","summary":" Shadows often occur when we capture the documents with casual equipment,\nwhich influences the visual quality and readability of the digital copies.\nDifferent from the algorithms for natural shadow removal, the algorithms in\ndocument shadow removal need to preserve the details of fonts and figures in\nhigh-resolution input. 
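The key mechanism in the GINSEW entry above is injecting a secret signal into the probability vector at each decoding step. The toy below only illustrates that mechanism with an assumed sinusoidal perturbation keyed to a random per-token signal; the paper's actual construction and its probing-based detection test differ:

```python
import numpy as np

rng = np.random.default_rng(0)
VOCAB = 1000
secret_key = rng.standard_normal(VOCAB)           # secret per-token signal

def watermark_probs(p, step, eps=1e-3):
    """Inject a tiny secret signal into one decoding step's distribution.

    p: (VOCAB,) next-token probability vector.
    """
    signal = eps * np.sin(step + secret_key)      # secret, step-dependent tweak
    q = np.clip(p + signal, 1e-9, None)
    return q / q.sum()                            # renormalize to a distribution

p = rng.dirichlet(np.ones(VOCAB))
q = watermark_probs(p, step=3)
print(abs(q.sum() - 1.0) < 1e-9, float(np.max(np.abs(q - p))))
```

The perturbation is small enough to leave generation quality essentially unchanged, yet a model distilled from many such outputs inherits a statistical trace of `secret_key` that the owner can later probe for.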
Previous works ignore this problem and remove the\nshadows via approximate attention and small datasets, which might not work in\nreal-world situations. We handle high-resolution document shadow removal\ndirectly via a larger-scale real-world dataset and a carefully designed\nfrequency-aware network. As for the dataset, we acquire over 7k pairs of\nhigh-resolution (2462 x 3699) real-world document images with various\nsamples under different lighting conditions, which is 10 times larger than\nexisting datasets. As for the design of the network, we decouple the\nhigh-resolution images in the frequency domain, where the low-frequency details\nand high-frequency boundaries can be effectively learned via the carefully\ndesigned network structure. Powered by our network and dataset, the proposed\nmethod clearly shows a better performance than previous methods in terms of\nvisual quality and numerical results. The code, models, and dataset are\navailable at: https://github.com/CXH-Research/DocShadow-SD7K\n","authors":["Zinuo Li","Xuhang Chen","Chi-Man Pun","Xiaodong Cun"],"pdf_url":"https://arxiv.org/pdf/2308.14221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14213v1","updated":"2023-08-27T22:07:42Z","published":"2023-08-27T22:07:42Z","title":"Post-Hoc Explainability of BI-RADS Descriptors in a Multi-task Framework\n for Breast Cancer Detection and Segmentation","summary":" Despite recent medical advancements, breast cancer remains one of the most\nprevalent and deadly diseases among women. Although machine learning-based\nComputer-Aided Diagnosis (CAD) systems have shown potential to assist\nradiologists in analyzing medical images, the opaque nature of the\nbest-performing CAD systems has raised concerns about their trustworthiness and\ninterpretability. This paper proposes MT-BI-RADS, a novel explainable deep\nlearning approach for tumor detection in Breast Ultrasound (BUS) images. The\napproach offers three levels of explanations to enable radiologists to\ncomprehend the decision-making process in predicting tumor malignancy. Firstly,\nthe proposed model outputs the BI-RADS categories used for BUS image analysis\nby radiologists. Secondly, the model employs multi-task learning to\nconcurrently segment regions in images that correspond to tumors. Thirdly, the\nproposed approach outputs quantified contributions of each BI-RADS descriptor\ntoward predicting the benign or malignant class using post-hoc explanations\nwith Shapley Values.\n","authors":["Mohammad Karimzadeh","Aleksandar Vakanski","Min Xian","Boyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14213v1.pdf","comment":"11 pages, 5 figures. Published at 2023 IEEE Workshop on MLSP"},{"id":"http://arxiv.org/abs/2308.05695v2","updated":"2023-08-27T22:05:35Z","published":"2023-08-10T16:57:14Z","title":"Masked Diffusion as Self-supervised Representation Learner","summary":" Denoising diffusion probabilistic models have recently demonstrated\nstate-of-the-art generative performance and been used as strong pixel-level\nrepresentation learners. This paper decomposes the interrelation between the\ngenerative capability and representation learning ability inherent in diffusion\nmodels. We present masked diffusion model (MDM), a scalable self-supervised\nrepresentation learner that substitutes the conventional additive Gaussian\nnoise of traditional diffusion with a masking mechanism. 
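The masking mechanism that MDM substitutes for additive Gaussian noise, per the abstract above, can be pictured as patchwise masking whose ratio plays the role of the noise level. Patch size and ratio below are illustrative assumptions, not the paper's settings:

```python
import torch

def mask_patches(img, ratio, patch=16):
    """Zero out a random subset of patches; `ratio` acts as the corruption level."""
    B, C, H, W = img.shape
    gh, gw = H // patch, W // patch
    keep = torch.rand(B, 1, gh, gw) > ratio            # True = patch kept
    mask = keep.repeat_interleave(patch, 2).repeat_interleave(patch, 3)
    return img * mask, mask

x = torch.randn(2, 3, 224, 224)
corrupted, mask = mask_patches(x, ratio=0.75)
print(corrupted.shape, float(mask.float().mean()))  # roughly 0.25 of pixels kept
```

Training a network to reconstruct `x` from `corrupted` then yields representations shaped by the masking-based corruption rather than by Gaussian noise, which is the substitution the abstract describes.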
Our proposed approach\nconvincingly surpasses prior benchmarks, demonstrating remarkable advancements\nin both medical and natural image semantic segmentation tasks, particularly\nwithin the context of few-shot scenario.\n","authors":["Zixuan Pan","Jianxu Chen","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2308.05695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14212v1","updated":"2023-08-27T22:02:41Z","published":"2023-08-27T22:02:41Z","title":"Exploring the Transfer Learning Capabilities of CLIP in Domain\n Generalization for Diabetic Retinopathy","summary":" Diabetic Retinopathy (DR), a leading cause of vision impairment, requires\nearly detection and treatment. Developing robust AI models for DR\nclassification holds substantial potential, but a key challenge is ensuring\ntheir generalization in unfamiliar domains with varying data distributions. To\naddress this, our paper investigates cross-domain generalization, also known as\ndomain generalization (DG), within the context of DR classification. DG, a\nchallenging problem in the medical domain, is complicated by the difficulty of\ngathering labeled data across different domains, such as patient demographics\nand disease stages. Some recent studies have shown the effectiveness of using\nCLIP to handle the DG problem in natural images. In this study, we investigate\nCLIP's transfer learning capabilities and its potential for cross-domain\ngeneralization in diabetic retinopathy (DR) classification. We carry out\ncomprehensive experiments to assess the efficacy and potential of CLIP in\naddressing DG for DR classification. Further, we introduce a multi-modal\nfine-tuning strategy named Context Optimization with Learnable Visual Tokens\n(CoOpLVT), which enhances context optimization by conditioning on visual\nfeatures. Our findings demonstrate that the proposed method increases the\nF1-score by 1.8% over the baseline, thus underlining its promise for effective\nDG in DR classification. Our code is publicly available at\nhttps://github.com/Sanoojan/CLIP-DRDG.\n","authors":["Sanoojan Baliah","Fadillah A. Maani","Santosh Sanjeev","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2308.14212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00646v2","updated":"2023-08-27T20:24:37Z","published":"2022-11-01T00:40:09Z","title":"Learning Melanocytic Cell Masks from Adjacent Stained Tissue","summary":" Melanoma is one of the most aggressive forms of skin cancer, causing a large\nproportion of skin cancer deaths. However, melanoma diagnoses by pathologists\nshows low interrater reliability. As melanoma is a cancer of the melanocyte,\nthere is a clear need to develop a melanocytic cell segmentation tool that is\nagnostic to pathologist variability and automates pixel-level annotation.\nGigapixel-level pathologist labeling, however, is impractical. Herein, we\npropose a means to train deep neural networks for melanocytic cell segmentation\nfrom hematoxylin and eosin (H&E) stained slides using paired\nimmunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean\nIOU of 0.64 despite imperfect ground-truth labels.\n","authors":["Mikio Tada","Maria L. Wei","Michael J. 
Keiser"],"pdf_url":"https://arxiv.org/pdf/2211.00646v2.pdf","comment":"{Medical Image Learning with Limited & Noisy Data Workshop at MICCAI\n 2022"},{"id":"http://arxiv.org/abs/2308.14191v1","updated":"2023-08-27T19:44:44Z","published":"2023-08-27T19:44:44Z","title":"SketchDreamer: Interactive Text-Augmented Creative Sketch Ideation","summary":" Artificial Intelligence Generated Content (AIGC) has shown remarkable\nprogress in generating realistic images. However, in this paper, we take a step\n\"backward\" and address AIGC for the most rudimentary visual modality of human\nsketches. Our objective is on the creative nature of sketches, and that\ncreative sketching should take the form of an interactive process. We further\nenable text to drive the sketch ideation process, allowing creativity to be\nfreely defined, while simultaneously tackling the challenge of \"I can't\nsketch\". We present a method to generate controlled sketches using a\ntext-conditioned diffusion model trained on pixel representations of images.\nOur proposed approach, referred to as SketchDreamer, integrates a\ndifferentiable rasteriser of Bezier curves that optimises an initial input to\ndistil abstract semantic knowledge from a pretrained diffusion model. We\nutilise Score Distillation Sampling to learn a sketch that aligns with a given\ncaption, which importantly enable both text and sketch to interact with the\nideation process. Our objective is to empower non-professional users to create\nsketches and, through a series of optimisation processes, transform a narrative\ninto a storyboard by expanding the text prompt while making minor adjustments\nto the sketch input. Through this work, we hope to aspire the way we create\nvisual content, democratise the creative process, and inspire further research\nin enhancing human creativity in AIGC. The code is available at\n\\url{https://github.com/WinKawaks/SketchDreamer}.\n","authors":["Zhiyu Qu","Tao Xiang","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2308.14191v1.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2308.14190v1","updated":"2023-08-27T19:43:43Z","published":"2023-08-27T19:43:43Z","title":"Score-Based Generative Models for PET Image Reconstruction","summary":" Score-based generative models have demonstrated highly promising results for\nmedical image reconstruction tasks in magnetic resonance imaging or computed\ntomography. However, their application to Positron Emission Tomography (PET) is\nstill largely unexplored. PET image reconstruction involves a variety of\nchallenges, including Poisson noise with high variance and a wide dynamic\nrange. To address these challenges, we propose several PET-specific adaptations\nof score-based generative models. The proposed framework is developed for both\n2D and 3D PET. In addition, we provide an extension to guided reconstruction\nusing magnetic resonance images. We validate the approach through extensive 2D\nand 3D $\\textit{in-silico}$ experiments with a model trained on\npatient-realistic data without lesions, and evaluate on data without lesions as\nwell as out-of-distribution data with lesions. 
This demonstrates the proposed\nmethod's robustness and significant potential for improved PET reconstruction.\n","authors":["Imraj RD Singh","Alexander Denker","Riccardo Barbano","Željko Kereta","Bangti Jin","Kris Thielemans","Peter Maass","Simon Arridge"],"pdf_url":"https://arxiv.org/pdf/2308.14190v1.pdf","comment":"35 pages, 16 figures, submitted to Journal of Machine Learning for\n Biomedical Imaging (MELBA)"},{"id":"http://arxiv.org/abs/2212.02053v3","updated":"2023-08-27T19:41:53Z","published":"2022-12-05T06:14:23Z","title":"Day2Dark: Pseudo-Supervised Activity Recognition beyond Silent Daylight","summary":" This paper strives to recognize activities in the dark, as well as in the\nday. We first establish that state-of-the-art activity recognizers are\neffective during the day, but not trustworthy in the dark. The main causes are\nthe limited availability of labeled dark videos to learn from, as well as the\ndistribution shift towards the lower color contrast at test-time. To compensate\nfor the lack of labeled dark videos, we introduce a pseudo-supervised learning\nscheme, which utilizes easy to obtain unlabeled and task-irrelevant dark videos\nto improve an activity recognizer in low light. As the lower color contrast\nresults in visual information loss, we further propose to incorporate the\ncomplementary activity information within audio, which is invariant to\nillumination. Since the usefulness of audio and visual features differs\ndepending on the amount of illumination, we introduce our `darkness-adaptive'\naudio-visual recognizer. Experiments on EPIC-Kitchens, Kinetics-Sound, and\nCharades demonstrate our proposals are superior to image enhancement, domain\nadaptation and alternative audio-visual fusion methods, and can even improve\nrobustness to local darkness caused by occlusions. Project page:\nhttps://xiaobai1217.github.io/Day2Dark/\n","authors":["Yunhua Zhang","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2212.02053v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.14179v1","updated":"2023-08-27T18:46:47Z","published":"2023-08-27T18:46:47Z","title":"Towards Vision-Language Mechanistic Interpretability: A Causal Tracing\n Tool for BLIP","summary":" Mechanistic interpretability seeks to understand the neural mechanisms that\nenable specific behaviors in Large Language Models (LLMs) by leveraging\ncausality-based methods. While these approaches have identified neural circuits\nthat copy spans of text, capture factual knowledge, and more, they remain\nunusable for multimodal models since adapting these tools to the\nvision-language domain requires considerable architectural changes. In this\nwork, we adapt a unimodal causal tracing tool to BLIP to enable the study of\nthe neural mechanisms underlying image-conditioned text generation. We\ndemonstrate our approach on a visual question answering dataset, highlighting\nthe causal relevance of later layer representations for all tokens.\nFurthermore, we release our BLIP causal tracing tool as open source to enable\nfurther experimentation in vision-language mechanistic interpretability by the\ncommunity. Our code is available at\nhttps://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability.\n","authors":["Vedant Palit","Rohan Pandey","Aryaman Arora","Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2308.14179v1.pdf","comment":"Final version for 5th Workshop on Closing the Loop Between Vision and\n Language (CLVL) @ ICCV 2023. 
4 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14177v1","updated":"2023-08-27T18:38:57Z","published":"2023-08-27T18:38:57Z","title":"AIGC for Various Data Modalities: A Survey","summary":" AI-generated content (AIGC) methods aim to produce text, images, videos, 3D\nassets, and other media using AI algorithms. Due to its wide range of\napplications and the demonstrated potential of recent works, AIGC developments\nhave been attracting a lot of attention recently, and AIGC methods have been\ndeveloped for various data modalities, such as image, video, text, 3D shape (as\nvoxels, point clouds, meshes, and neural implicit fields), 3D scene, 3D human\navatar (body and head), 3D motion, and audio -- each presenting different\ncharacteristics and challenges. Furthermore, there have also been many\nsignificant developments in cross-modality AIGC methods, where generative\nmethods can receive conditioning input in one modality and produce outputs in\nanother. Examples include going from various modalities to image, video, 3D\nshape, 3D scene, 3D avatar (body and head), 3D motion (skeleton and avatar),\nand audio modalities. In this paper, we provide a comprehensive review of AIGC\nmethods across different data modalities, including both single-modal and\ncross-modality methods, highlighting the various challenges, representative\nworks, and recent technical directions in each setting. We also present\ncomparative results on several benchmark datasets in various modalities.\nMoreover, we also discuss the challenges and potential future research\ndirections.\n","authors":["Lin Geng Foo","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08244v2","updated":"2023-08-27T18:23:01Z","published":"2022-11-14T03:51:12Z","title":"Artificial Intelligence for Automatic Detection and Classification\n Disease on the X-Ray Images","summary":" Detecting and classifying diseases using X-ray images is one of the more\nchallenging core tasks in the medical and research world. Due to the recent\nhigh interest in radiological images and AI, early detection of diseases in\nX-ray images has become notably more essential to prevent further spreading and\nflatten the curve. Innovations and revolutions of Computer Vision with Deep\nlearning methods offer great promise for fast and accurate diagnosis of\nscreening and detection from chest X-ray images (CXR). This work presents rapid\ndetection of diseases in the lung using the efficient Deep learning pre-trained\nRepVGG algorithm for deep feature extraction and classification. We used X-ray\nimages as an example to show the model's efficiency. To perform this task, we\nclassify X-Ray images into Covid-19, Pneumonia, and Normal X-Ray images. Employ\nROI object to improve the detection accuracy for lung extraction, followed by\ndata pre-processing and augmentation. We are applying Artificial Intelligence\ntechnology for automatic highlighted detection of affected areas of people's\nlungs. Based on the X-Ray images, an algorithm was developed that classifies\nX-Ray images with height accuracy and power faster thanks to the architecture\ntransformation of the model. We compared deep learning frameworks' accuracy and\ndetection of disease. The study shows the high power of deep learning methods\nfor X-ray images based on COVID-19 detection utilizing chest X-rays. 
The\nproposed framework offers better diagnostic accuracy by comparing popular deep\nlearning models, i.e., VGG, ResNet50, inceptionV3, DenseNet, and\nInceptionResnetV2.\n","authors":["Liora Mayats-Alpay"],"pdf_url":"https://arxiv.org/pdf/2211.08244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.00785v5","updated":"2023-08-27T18:01:10Z","published":"2022-01-03T18:05:52Z","title":"Implicit Autoencoder for Point-Cloud Self-Supervised Representation\n Learning","summary":" This paper advocates the use of implicit surface representation in\nautoencoder-based self-supervised 3D representation learning. The most popular\nand accessible 3D representation, i.e., point clouds, involves discrete samples\nof the underlying continuous 3D surface. This discretization process introduces\nsampling variations on the 3D shape, making it challenging to develop\ntransferable knowledge of the true 3D geometry. In the standard autoencoding\nparadigm, the encoder is compelled to encode not only the 3D geometry but also\ninformation on the specific discrete sampling of the 3D shape into the latent\ncode. This is because the point cloud reconstructed by the decoder is\nconsidered unacceptable unless there is a perfect mapping between the original\nand the reconstructed point clouds. This paper introduces the Implicit\nAutoEncoder (IAE), a simple yet effective method that addresses the sampling\nvariation issue by replacing the commonly-used point-cloud decoder with an\nimplicit decoder. The implicit decoder reconstructs a continuous representation\nof the 3D shape, independent of the imperfections in the discrete samples.\nExtensive experiments demonstrate that the proposed IAE achieves\nstate-of-the-art performance across various self-supervised learning\nbenchmarks.\n","authors":["Siming Yan","Zhenpei Yang","Haoxiang Li","Chen Song","Li Guan","Hao Kang","Gang Hua","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2201.00785v5.pdf","comment":"Published in ICCV 2023. The code is available at\n https://github.com/SimingYan/IAE"},{"id":"http://arxiv.org/abs/2308.14161v1","updated":"2023-08-27T17:44:25Z","published":"2023-08-27T17:44:25Z","title":"Intergrated Segmentation and Detection Models for Dentex Challenge 2023","summary":" Dental panoramic x-rays are commonly used in dental diagnosing. With the\ndevelopment of deep learning, auto detection of diseases from dental panoramic\nx-rays can help dentists to diagnose diseases more efficiently. The Dentex\nChallenge 2023 is a competition for automatic detection of abnormal teeth along\nwith their enumeration ids from dental panoramic x-rays. In this paper, we\npropose a method integrating segmentation and detection models to detect\nabnormal teeth as well as obtain their enumeration ids. Our codes are available\nat https://github.com/xyzlancehe/DentexSegAndDet.\n","authors":["Lanshan He","Yusheng Liu","Lisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14160v1","updated":"2023-08-27T17:30:56Z","published":"2023-08-27T17:30:56Z","title":"A Unified Transformer-based Network for multimodal Emotion Recognition","summary":" The development of transformer-based models has resulted in significant\nadvances in addressing various vision and NLP-based research challenges.\nHowever, the progress made in transformer-based methods has not been\neffectively applied to biosensing research. 
This paper presents a novel Unified\nBiosensor-Vision Multi-modal Transformer-based (UBVMT) method to classify\nemotions in an arousal-valence space by combining a 2D representation of an\nECG/PPG signal with the face information. To achieve this goal, we first\ninvestigate and compare the unimodal emotion recognition performance of three\nimage-based representations of the ECG/PPG signal. We then present our UBVMT\nnetwork which is trained to perform emotion recognition by combining the 2D\nimage-based representation of the ECG/PPG signal and the facial expression\nfeatures. Our unified transformer model consists of homogeneous transformer\nblocks that take as an input the 2D representation of the ECG/PPG signal and\nthe corresponding face frame for emotion representation learning with minimal\nmodality-specific design. Our UBVMT model is trained by reconstructing masked\npatches of video frames and 2D images of ECG/PPG signals, and contrastive\nmodeling to align face and ECG/PPG data. Extensive experiments on the\nMAHNOB-HCI and DEAP datasets show that our Unified UBVMT-based model produces\ncomparable results to the state-of-the-art techniques.\n","authors":["Kamran Ali","Charles E. Hughes"],"pdf_url":"https://arxiv.org/pdf/2308.14160v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2204.05905v2","updated":"2023-08-27T16:55:27Z","published":"2022-04-12T16:05:10Z","title":"Few-shot Forgery Detection via Guided Adversarial Interpolation","summary":" The increase in face manipulation models has led to a critical issue in\nsociety - the synthesis of realistic visual media. With the emergence of new\nforgery approaches at an unprecedented rate, existing forgery detection methods\nsuffer from significant performance drops when applied to unseen novel forgery\napproaches. In this work, we address the few-shot forgery detection problem by\n1) designing a comprehensive benchmark based on coverage analysis among various\nforgery approaches, and 2) proposing Guided Adversarial Interpolation (GAI).\nOur key insight is that there exist transferable distribution characteristics\nbetween majority and minority forgery classes. Specifically, we enhance the\ndiscriminative ability against novel forgery approaches via adversarially\ninterpolating the forgery artifacts of the minority samples to the majority\nsamples under the guidance of a teacher network. Unlike the standard\nre-balancing method which usually results in over-fitting to minority classes,\nour method simultaneously takes account of the diversity of majority\ninformation as well as the significance of minority information. Extensive\nexperiments demonstrate that our GAI achieves state-of-the-art performances on\nthe established few-shot forgery detection benchmark. Notably, our method is\nalso validated to be robust to choices of majority and minority forgery\napproaches. The formal publication version is available in Pattern Recognition.\n","authors":["Haonan Qiu","Siyu Chen","Bei Gan","Kun Wang","Huafeng Shi","Jing Shao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2204.05905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12856v3","updated":"2023-08-27T16:46:38Z","published":"2022-08-26T20:08:40Z","title":"Local Context-Aware Active Domain Adaptation","summary":" Active Domain Adaptation (ADA) queries the labels of a small number of\nselected target samples to help adapt a model from a source domain to a\ntarget domain. The local context of queried data is important, especially when\nthe domain gap is large. 
However, this has not been fully explored by existing\nADA works. In this paper, we propose a Local context-aware ADA framework, named\nLADA, to address this issue. To select informative target samples, we devise a\nnovel criterion based on the local inconsistency of model predictions. Since\nthe labeling budget is usually small, fine-tuning the model on only the queried\ndata can be inefficient. We progressively augment labeled target data with the\nconfident neighbors in a class-balanced manner. Experiments validate that the\nproposed criterion chooses more informative target samples than existing active\nselection strategies. Furthermore, our full method clearly surpasses recent ADA\nmethods on various benchmarks. Code is available at https://github.com/tsun/LADA.\n","authors":["Tao Sun","Cheng Lu","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2208.12856v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14153v1","updated":"2023-08-27T16:33:11Z","published":"2023-08-27T16:33:11Z","title":"Sparse Sampling Transformer with Uncertainty-Driven Ranking for Unified\n Removal of Raindrops and Rain Streaks","summary":" In the real world, image degradations caused by rain often exhibit a\ncombination of rain streaks and raindrops, thereby increasing the challenges of\nrecovering the underlying clean image. Note that the rain streaks and raindrops\nhave diverse shapes, sizes, and locations in the captured image, and thus\nmodeling the correlation between irregular degradations caused by\nrain artifacts is a necessary prerequisite for image deraining. This paper aims\nto present an efficient and flexible mechanism to learn and model degradation\nrelationships in a global view, thereby achieving a unified removal of\nintricate rain scenes. To do so, we propose a Sparse Sampling Transformer based\non Uncertainty-Driven Ranking, dubbed UDR-S2Former. Compared to previous\nmethods, our UDR-S2Former has three merits. First, it can adaptively sample\nrelevant image degradation information to model underlying degradation\nrelationships. Second, explicit application of the uncertainty-driven ranking\nstrategy can facilitate the network to attend to degradation features and\nunderstand the reconstruction process. Finally, experimental results show that\nour UDR-S2Former clearly outperforms state-of-the-art methods for all\nbenchmarks.\n","authors":["Sixiang Chen","Tian Ye","Jinbin Bai","Erkang Chen","Jun Shi","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14153v1.pdf","comment":"Accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2308.14152v1","updated":"2023-08-27T16:22:09Z","published":"2023-08-27T16:22:09Z","title":"Unaligned 2D to 3D Translation with Conditional Vector-Quantized Code\n Diffusion using Transformers","summary":" Generating 3D images of complex objects conditionally from a few 2D views is\na difficult synthesis problem, compounded by issues such as domain gap and\ngeometric misalignment. For instance, unified frameworks such as Generative\nAdversarial Networks cannot achieve this unless they explicitly define both a\ndomain-invariant and geometric-invariant joint latent distribution, whereas\nNeural Radiance Fields are generally unable to handle both issues as they\noptimize at the pixel level. By contrast, we propose a simple and novel 2D to\n3D synthesis approach based on conditional diffusion with vector-quantized\ncodes. Operating in an information-rich code space enables high-resolution 3D\nsynthesis via full-coverage attention across the views.
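LADA's selection criterion based on local inconsistency of model predictions could plausibly be approximated as follows; this is an illustrative sketch in which the neighborhood size, the KL-based disagreement measure, and all names are assumptions, not the paper's exact criterion.

```python
# Score each unlabeled target sample by how much its prediction disagrees
# with its feature-space neighbors; higher scores suggest informative samples.
import torch
import torch.nn.functional as F

def local_inconsistency_scores(probs, feats, k=10):
    # probs: (N, C) softmax predictions; feats: (N, D) features of target data.
    feats = F.normalize(feats, dim=-1)
    sim = feats @ feats.t()
    sim.fill_diagonal_(-float("inf"))
    knn = sim.topk(k, dim=-1).indices                      # (N, k) nearest neighbors
    neighbor_probs = probs[knn]                            # (N, k, C)
    # KL divergence from each sample's prediction to each neighbor's prediction.
    kl = (probs.unsqueeze(1) * (probs.unsqueeze(1).clamp_min(1e-8).log()
          - neighbor_probs.clamp_min(1e-8).log())).sum(-1)
    return kl.mean(dim=1)  # (N,) query the top scorers under the labeling budget
```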
Specifically, we\ngenerate the 3D codes (e.g. for CT images) conditional on previously generated\n3D codes and the entire codebook of two 2D views (e.g. 2D X-rays). Qualitative\nand quantitative results demonstrate state-of-the-art performance over\nspecialized methods across varied evaluation criteria, including fidelity\nmetrics such as density, coverage, and distortion metrics for two complex\nvolumetric imagery datasets from real-world scenarios.\n","authors":["Abril Corona-Figueroa","Sam Bond-Taylor","Neelanjan Bhowmik","Yona Falinie A. Gaus","Toby P. Breckon","Hubert P. H. Shum","Chris G. Willcocks"],"pdf_url":"https://arxiv.org/pdf/2308.14152v1.pdf","comment":"Camera-ready version for ICCV 2023"},{"id":"http://arxiv.org/abs/2303.09551v2","updated":"2023-08-27T15:33:19Z","published":"2023-03-16T17:59:08Z","title":"SurroundOcc: Multi-Camera 3D Occupancy Prediction for Autonomous Driving","summary":" 3D scene understanding plays a vital role in vision-based autonomous driving.\nWhile most existing methods focus on 3D object detection, they have difficulty\ndescribing real-world objects of arbitrary shapes and infinite classes. Towards\na more comprehensive perception of a 3D scene, in this paper, we propose a\nSurroundOcc method to predict the 3D occupancy with multi-camera images. We\nfirst extract multi-scale features for each image and adopt spatial 2D-3D\nattention to lift them to the 3D volume space. Then we apply 3D convolutions to\nprogressively upsample the volume features and impose supervision on multiple\nlevels. To obtain dense occupancy prediction, we design a pipeline to generate\ndense occupancy ground truth without expensive occupancy annotations.\nSpecifically, we fuse multi-frame LiDAR scans of dynamic objects and static\nscenes separately. Then we adopt Poisson Reconstruction to fill the holes and\nvoxelize the mesh to get dense occupancy labels. Extensive experiments on\nnuScenes and SemanticKITTI datasets demonstrate the superiority of our method.\nCode and dataset are available at https://github.com/weiyithu/SurroundOcc\n","authors":["Yi Wei","Linqing Zhao","Wenzhao Zheng","Zheng Zhu","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2303.09551v2.pdf","comment":"Accepted to ICCV 2023. Code is available at\n https://github.com/weiyithu/SurroundOcc"},{"id":"http://arxiv.org/abs/2308.14133v1","updated":"2023-08-27T15:21:25Z","published":"2023-08-27T15:21:25Z","title":"Cheap Lunch for Medical Image Segmentation by Fine-tuning SAM on Few\n Exemplars","summary":" The Segment Anything Model (SAM) has demonstrated remarkable capabilities of\nscaled-up segmentation models, enabling zero-shot generalization across a\nvariety of domains. By leveraging large-scale foundational models as\npre-trained models, it is a natural progression to fine-tune SAM for specific\ndomains to further enhance performances. However, the adoption of foundational\nmodels in the medical domain presents a challenge due to the difficulty and\nexpense of labeling sufficient data for adaptation within hospital systems. In\nthis paper, we introduce an efficient and practical approach for fine-tuning\nSAM using a limited number of exemplars, making it suitable for such scenarios.\nOur approach combines two established techniques from the literature: an\nexemplar-guided synthesis module and the widely recognized Low-Rank Adaptation\n(LoRA) fine-tuning strategy, serving as data-level and model-level attempts,\nrespectively.
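The LoRA strategy named in the SAM fine-tuning abstract above has a compact generic form: freeze the pretrained weight and learn a low-rank additive update. A minimal sketch of the general technique, not the paper's implementation:

```python
# Generic LoRA wrapper around a pretrained linear layer; rank/alpha are assumptions.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank=4, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                  # freeze the pretrained weight
        self.lora_a = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.lora_b = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        # Frozen base projection plus a trainable low-rank update.
        return self.base(x) + (x @ self.lora_a.t() @ self.lora_b.t()) * self.scale
```

Only the small `lora_a`/`lora_b` matrices are trained, which is what makes the strategy attractive when labeled medical data is scarce.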
Interestingly, our empirical findings suggest that SAM can be\neffectively aligned within the medical domain even with only a small amount of\nlabeled data. We validate our approach through experiments on brain tumor segmentation (BraTS)\nand multi-organ CT segmentation (Synapse). The comprehensive results underscore\nthe feasibility and effectiveness of such an approach, paving the way for the\npractical application of SAM in the medical domain.\n","authors":["Weijia Feng","Lingting Zhu","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14133v1.pdf","comment":"Accepted by Brain Lesion (BrainLes) workshop of International\n Conference on Medical Image Computing and Computer Assisted Intervention\n (MICCAI BrainLes 2023). 10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.14126v1","updated":"2023-08-27T15:03:10Z","published":"2023-08-27T15:03:10Z","title":"Synergizing Contrastive Learning and Optimal Transport for 3D Point\n Cloud Domain Adaptation","summary":" Recently, the fundamental problem of unsupervised domain adaptation (UDA) on\n3D point clouds has been motivated by a wide variety of applications in\nrobotics, virtual reality, and scene understanding, to name a few. The point\ncloud data acquisition procedures introduce significant domain\ndiscrepancies and geometric variations among both similar and dissimilar\nclasses. The standard domain adaptation methods developed for images do not\ndirectly translate to point cloud data because of their complex geometric\nnature. To address this challenge, we leverage the idea of multimodality and\nalignment between distributions. We propose a new UDA architecture for point\ncloud classification that benefits from multimodal contrastive learning to get\nbetter class separation in both domains individually. Further, the use of\noptimal transport (OT) aims at learning source and target data distributions\njointly to reduce the cross-domain shift and provide a better alignment. We\nconduct a comprehensive empirical study on PointDA-10 and GraspNetPC-10 and\nshow that our method achieves state-of-the-art performance on GraspNetPC-10\n(with approx 4-12% margin) and best average performance on PointDA-10. Our\nablation studies and decision boundary analysis also validate the significance\nof our contrastive learning module and OT alignment.\n","authors":["Siddharth Katageri","Arkadipta De","Chaitanya Devaguptapu","VSSV Prasad","Charu Sharma","Manohar Kaul"],"pdf_url":"https://arxiv.org/pdf/2308.14126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14119v1","updated":"2023-08-27T14:25:07Z","published":"2023-08-27T14:25:07Z","title":"Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario","summary":" Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to\nimprove model performance. Traditional SSL methods assume that labeled and\nunlabeled data share the same label space. However, in real-world applications,\nespecially when the labeled training set is small, there may be classes that\nare missing from the labeled set. Existing frameworks aim to either reject all\nunseen classes (open-set SSL) or to discover unseen classes by partitioning an\nunlabeled set during training (open-world SSL). In our work, we construct a\nclassifier for points from both seen and unseen classes. Our approach is based\non extending an existing SSL method, such as FlexMatch, by incorporating an\nadditional entropy loss.
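The entropy-loss extension just described can be sketched generically: keep the supervised objective and add an entropy term computed on unlabeled predictions. The weighting and function names below are hypothetical, not the authors' code.

```python
# Hedged sketch: supervised cross-entropy plus an entropy term on unlabeled logits.
import torch
import torch.nn.functional as F

def ssl_loss_with_entropy(sup_logits, sup_labels, unsup_logits, lam=0.1):
    sup_loss = F.cross_entropy(sup_logits, sup_labels)
    probs = F.softmax(unsup_logits, dim=-1)
    # Mean per-sample prediction entropy over the full (seen + unseen) class space.
    entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=-1).mean()
    return sup_loss + lam * entropy
```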
This enhancement allows our method to improve the\nperformance of any existing SSL method in the classification of both seen and\nunseen classes. We demonstrate large gains over state-of-the-art\nSSL, open-set SSL, and open-world SSL methods, on two benchmark image\nclassification datasets, CIFAR-100 and STL-10. The gains are most pronounced\nwhen the labeled data is severely limited (1-25 labeled examples per class).\n","authors":["Noam Fluss","Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13608v2","updated":"2023-08-27T14:11:34Z","published":"2023-05-23T02:16:14Z","title":"VDD: Varied Drone Dataset for Semantic Segmentation","summary":" Semantic segmentation of drone images is critical to many aerial vision tasks\nas it provides essential semantic details that can compensate for the lack of\ndepth information from monocular cameras. However, maintaining high accuracy of\nsemantic segmentation models for drones requires diverse, large-scale, and\nhigh-resolution datasets, which are rare in the field of aerial image\nprocessing. Existing datasets are typically small and focus primarily on urban\nscenes, neglecting rural and industrial areas. Models trained on such datasets\nare not sufficiently equipped to handle the variety of inputs seen in drone\nimagery. With the Varied Drone Dataset (VDD), we offer a large-scale and densely\nlabeled dataset comprising 400 high-resolution images that feature carefully\nchosen scenes, camera angles, and varied light and weather conditions.\nFurthermore, we have adapted existing drone datasets to conform to our\nannotation standards and integrated them with VDD to create a dataset 1.5 times\nthe size of the fine annotations of Cityscapes. We have developed a novel DeepLabT\nmodel, which combines CNN and Transformer backbones, to provide a reliable\nbaseline for semantic segmentation in drone imagery. Our experiments indicate\nthat DeepLabT performs admirably on VDD and other drone datasets. We expect\nthat our dataset will generate considerable interest in drone image\nsegmentation and serve as a foundation for other drone vision tasks. VDD is\nfreely available on our website at https://vddvdd.com .\n","authors":["Wenxiao Cai","Ke Jin","Jinyan Hou","Cong Guo","Letian Wu","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2305.13608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14113v1","updated":"2023-08-27T14:07:57Z","published":"2023-08-27T14:07:57Z","title":"Semantic-aware Consistency Network for Cloth-changing Person\n Re-Identification","summary":" Cloth-changing Person Re-Identification (CC-ReID) is a challenging task that\naims to retrieve the target person across multiple surveillance cameras when\nclothing changes might happen. Despite recent progress in CC-ReID, existing\napproaches are still hindered by the interference of clothing variations since\nthey lack effective constraints to keep the model consistently focused on\nclothing-irrelevant regions. To address this issue, we present a Semantic-aware\nConsistency Network (SCNet) to learn identity-related semantic features by\nproposing effective consistency constraints. Specifically, we generate the\nblack-clothing image by erasing pixels in the clothing area, which explicitly\nmitigates the interference from clothing variations.
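The black-clothing generation step just described amounts to zeroing out clothing pixels using a human-parsing map. A small illustrative sketch, with made-up parsing label ids:

```python
# Hypothetical erasing step: the parsing label ids below are assumptions.
import numpy as np

CLOTHING_LABELS = {5, 6, 7}  # e.g. upper clothes / pants / skirt in some parsers

def erase_clothing(image, parsing):
    # image: (H, W, 3) uint8 person crop; parsing: (H, W) integer semantic labels.
    out = image.copy()
    clothing_mask = np.isin(parsing, list(CLOTHING_LABELS))
    out[clothing_mask] = 0   # paint clothing regions black
    return out
```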
In addition, to fully\nexploit the fine-grained identity information, a head-enhanced attention module\nis introduced, which learns soft attention maps by utilizing the proposed\npart-based matching loss to highlight head information. We further design a\nsemantic consistency loss to facilitate the learning of high-level\nidentity-related semantic features, forcing the model to focus on semantically\nconsistent cloth-irrelevant regions. By using the consistency constraint, our\nmodel does not require any extra auxiliary segmentation module to generate the\nblack-clothing image or locate the head region during the inference stage.\nExtensive experiments on four cloth-changing person Re-ID datasets (LTCC, PRCC,\nVc-Clothes, and DeepChange) demonstrate that our proposed SCNet makes\nsignificant improvements over prior state-of-the-art approaches. Our code is\navailable at: https://github.com/Gpn-star/SCNet.\n","authors":["Peini Guo","Hong Liu","Jianbing Wu","Guoquan Wang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14113v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2304.09807v2","updated":"2023-08-27T13:58:18Z","published":"2023-04-19T16:47:20Z","title":"VMA: Divide-and-Conquer Vectorized Map Annotation System for Large-Scale\n Driving Scene","summary":" High-definition (HD) maps serve as the essential infrastructure of autonomous\ndriving. In this work, we build up a systematic vectorized map annotation\nframework (termed VMA) for efficiently generating HD maps of large-scale driving\nscenes. We design a divide-and-conquer annotation scheme to solve the spatial\nextensibility problem of HD map generation, and abstract map elements with a\nvariety of geometric patterns as unified point sequence representation, which\ncan be extended to most map elements in the driving scene. VMA is highly\nefficient and extensible, requiring negligible human effort, and flexible in\nterms of spatial scale and element type. We quantitatively and qualitatively\nvalidate the annotation performance on real-world urban and highway scenes, as\nwell as NYC Planimetric Database. VMA can significantly improve map generation\nefficiency and require little human effort. On average VMA takes 160min for\nannotating a scene with a range of hundreds of meters, and reduces the human\ncost by 52.3%, showing great application value. Code:\nhttps://github.com/hustvl/VMA.\n","authors":["Shaoyu Chen","Yunchi Zhang","Bencheng Liao","Jiafeng Xie","Tianheng Cheng","Wei Sui","Qian Zhang","Chang Huang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.09807v2.pdf","comment":"https://github.com/hustvl/VMA"},{"id":"http://arxiv.org/abs/2308.14108v1","updated":"2023-08-27T13:50:15Z","published":"2023-08-27T13:50:15Z","title":"Depth self-supervision for single image novel view synthesis","summary":" In this paper, we tackle the problem of generating a novel image from an\narbitrary viewpoint given a single frame as input. While existing methods\noperating in this setup aim at predicting the target view depth map to guide\nthe synthesis, without explicit supervision over such a task, we jointly\noptimize our framework for both novel view synthesis and depth estimation to\nunleash the synergy between the two at its best. Specifically, a shared depth\ndecoder is trained in a self-supervised manner to predict depth maps that are\nconsistent across the source and target views.
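Cross-view depth consistency of the kind described above is typically enforced by reprojecting one view's depth into the other. The sketch below is a bare-bones, assumption-laden version (known intrinsics and relative pose, simple L1 penalty), not the paper's actual loss:

```python
# Hedged sketch: back-project source depth, transform to the target view,
# sample the target depth there, and penalize disagreement.
import torch
import torch.nn.functional as F

def depth_consistency_loss(d_src, d_tgt, K, K_inv, T_src_to_tgt):
    # d_src, d_tgt: (B, 1, H, W) depths; K, K_inv: (B, 3, 3); T_src_to_tgt: (B, 4, 4).
    B, _, H, W = d_src.shape
    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    pix = torch.stack([xs, ys, torch.ones_like(xs)], dim=0).float()   # (3, H, W)
    pix = pix.view(1, 3, -1).expand(B, -1, -1).to(d_src.device)
    cam = K_inv @ pix * d_src.view(B, 1, -1)                          # back-project
    cam_h = torch.cat([cam, torch.ones_like(cam[:, :1])], dim=1)      # homogeneous
    tgt = (T_src_to_tgt @ cam_h)[:, :3]                               # target-view 3D
    proj = K @ tgt
    uv = proj[:, :2] / proj[:, 2:].clamp_min(1e-6)
    # Normalize to [-1, 1] and sample the target depth at projected locations.
    grid = torch.stack([uv[:, 0] / (W - 1) * 2 - 1,
                        uv[:, 1] / (H - 1) * 2 - 1], dim=-1).view(B, H, W, 2)
    sampled = F.grid_sample(d_tgt, grid, align_corners=True)
    return (sampled - tgt[:, 2:].view(B, 1, H, W)).abs().mean()
```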
Our results demonstrate the\neffectiveness of our approach in addressing the challenges of both tasks\nallowing for higher-quality generated images, as well as more accurate depth\nfor the target viewpoint.\n","authors":["Giovanni Minelli","Matteo Poggi","Samuele Salti"],"pdf_url":"https://arxiv.org/pdf/2308.14108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1912.03623v3","updated":"2023-08-27T13:50:10Z","published":"2019-12-08T06:10:49Z","title":"Single image reflection removal via learning with multi-image\n constraints","summary":" Reflections are very common phenomena in our daily photography, which\ndistract people's attention from the scene behind the glass. The problem of\nremoving reflection artifacts is important but challenging due to its ill-posed\nnature. The traditional approaches solve an optimization problem over the\nconstraints induced from multiple images, at the expense of large computation\ncosts. Recent learning-based approaches have demonstrated a significant\nimprovement in both performance and running time for single image reflection\nremoval, but are limited as they require a large number of synthetic\nreflection/clean image pairs for direct supervision to approximate the ground\ntruth, at the risk of overfitting in the synthetic image domain and degrading\nin the real image domain. In this paper, we propose a novel learning-based\nsolution that combines the advantages of the aforementioned approaches and\novercomes their drawbacks. Our algorithm works by learning a deep neural\nnetwork to optimize the target with joint constraints enhanced among multiple\ninput images during the training phase, but is able to eliminate reflections\nonly from a single input for evaluation. Our algorithm runs in real-time and\nachieves state-of-the-art reflection removal performance on real images. We\nfurther propose a strong network backbone that disentangles the background and\nreflection information into separate latent codes, which are embedded into a\nshared one-branch deep neural network for both background and reflection\npredictions. The proposed backbone experimentally performs better than the\nother common network implementations, and provides insightful knowledge to\nunderstand the reflection removal task.\n","authors":["Yingda Yin","Qingnan Fan","Dongdong Chen","Yujie Wang","Angelica Aviles-Rivero","Ruoteng Li","Carola-Bibiane Schönlieb","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/1912.03623v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14105v1","updated":"2023-08-27T13:22:55Z","published":"2023-08-27T13:22:55Z","title":"Unified and Dynamic Graph for Temporal Character Grouping in Long Videos","summary":" Video temporal character grouping locates appearing moments of major\ncharacters within a video according to their identities. To this end, recent\nworks have evolved from unsupervised clustering to graph-based supervised\nclustering. However, graph methods are built upon the premise of fixed affinity\ngraphs, bringing many inexact connections. Besides, they extract multi-modal\nfeatures with various kinds of models, which is unfriendly to deployment. In this\npaper, we present a unified and dynamic graph (UniDG) framework for temporal\ncharacter grouping. This is accomplished firstly by a unified representation\nnetwork that learns representations of multiple modalities within the same\nspace and still preserves each modality's uniqueness simultaneously.
Secondly,\nwe present a dynamic graph clustering where the neighbors of different\nquantities are dynamically constructed for each node via a cyclic matching\nstrategy, leading to a more reliable affinity graph. Thirdly, a progressive\nassociation method is introduced to exploit spatial and temporal contexts among\ndifferent modalities, allowing multi-modal clustering results to be well fused.\nAs current datasets only provide pre-extracted features, we evaluate our UniDG\nmethod on a collected dataset named MTCG, which contains each character's\nappearing clips of face and body and speaking voice tracks. We also evaluate\nour key components on existing clustering and retrieval datasets to verify the\ngeneralization ability. Experimental results show that our method can\nachieve promising results and outperform several state-of-the-art approaches.\n","authors":["Xiujun Shu","Wei Wen","Liangsheng Xu","Mingbao Lin","Ruizhi Qiao","Taian Guo","Hanjun Li","Bei Gan","Xiao Wang","Xin Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14103v1","updated":"2023-08-27T13:17:34Z","published":"2023-08-27T13:17:34Z","title":"Towards Unified Token Learning for Vision-Language Tracking","summary":" In this paper, we present a simple, flexible and effective vision-language\n(VL) tracking pipeline, termed \\textbf{MMTrack}, which casts VL tracking as a\ntoken generation task. Traditional paradigms address VL tracking task\nindirectly with sophisticated prior designs, making them over-specialized to the\nfeatures of specific architectures or mechanisms. In contrast, our proposed\nframework serializes language description and bounding box into a sequence of\ndiscrete tokens. In this new design paradigm, all token queries are required to\nperceive the desired target and directly predict spatial coordinates of the\ntarget in an auto-regressive manner. The design without other prior modules\navoids multiple sub-tasks learning and hand-designed loss functions,\nsignificantly reducing the complexity of VL tracking modeling and allowing our\ntracker to use a simple cross-entropy loss as unified optimization objective\nfor VL tracking task. Extensive experiments on TNL2K, LaSOT, LaSOT$_{\\rm{ext}}$\nand OTB99-Lang benchmarks show that our approach achieves promising results,\ncompared to other state-of-the-art methods.\n","authors":["Yaozong Zheng","Bineng Zhong","Qihua Liang","Guorong Li","Rongrong Ji","Xianxian Li"],"pdf_url":"https://arxiv.org/pdf/2308.14103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14101v1","updated":"2023-08-27T13:13:28Z","published":"2023-08-27T13:13:28Z","title":"Superpixels algorithms through network community detection","summary":" Community detection is a powerful tool from complex network analysis that\nfinds applications in various research areas. Several image segmentation\nmethods rely for instance on community detection algorithms as a black box in\norder to compute undersegmentations, i.e. a small number of regions that\nrepresent areas of interest of the image. However, to the best of our\nknowledge, the efficiency of such an approach w.r.t. superpixels, which aim at\nrepresenting the image at a finer level while preserving as much of the\noriginal information as possible, has been neglected so far. The only related\nwork seems to be the one by Liu et al. (IET Image Processing, 2022) that\ndeveloped a superpixels algorithm using a so-called modularity maximization\napproach, leading to relevant results.
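MMTrack's serialization of a bounding box into discrete tokens can be illustrated with simple coordinate quantization, as in Pix2Seq-style formulations; the bin count below is an assumption, not a value from the paper.

```python
# Illustrative box <-> token conversion via uniform coordinate quantization.
def box_to_tokens(box, img_w, img_h, n_bins=1000):
    # box: (x1, y1, x2, y2) in pixels -> four integer tokens in [0, n_bins - 1].
    x1, y1, x2, y2 = box
    return [
        min(int(x1 / img_w * n_bins), n_bins - 1),
        min(int(y1 / img_h * n_bins), n_bins - 1),
        min(int(x2 / img_w * n_bins), n_bins - 1),
        min(int(y2 / img_h * n_bins), n_bins - 1),
    ]

def tokens_to_box(tokens, img_w, img_h, n_bins=1000):
    x1, y1, x2, y2 = tokens
    return (x1 / n_bins * img_w, y1 / n_bins * img_h,
            x2 / n_bins * img_w, y2 / n_bins * img_h)
```

A tracker can then predict these four tokens auto-regressively with a plain cross-entropy objective, which is what removes the need for hand-designed regression losses.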
We follow this line of research by studying the\nefficiency of superpixels computed by state-of-the-art community detection\nalgorithms on a 4-connected pixel graph, the so-called pixel-grid. We first detect\ncommunities on such a graph and then apply a simple merging procedure that\nallows us to obtain the desired number of superpixels. As we shall see, such\nmethods result in the computation of relevant superpixels as emphasized by both\nqualitative and quantitative experiments, according to different widely-used\nmetrics based on ground-truth comparison or on superpixels only. We observe\nthat the choice of the community detection algorithm has a great impact on the\nnumber of communities and hence on the merging procedure. Similarly, small\nvariations on the pixel-grid may provide different results from both\nqualitative and quantitative viewpoints. For the sake of completeness, we\ncompare our results with those of several state-of-the-art superpixels\nalgorithms as computed by Stutz et al. (Computer Vision and Image\nUnderstanding, 2018).\n","authors":["Anthony Perez"],"pdf_url":"https://arxiv.org/pdf/2308.14101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14100v1","updated":"2023-08-27T13:07:44Z","published":"2023-08-27T13:07:44Z","title":"Rethinking Exemplars for Continual Semantic Segmentation in Endoscopy\n Scenes: Entropy-based Mini-Batch Pseudo-Replay","summary":" Endoscopy is a widely used technique for the early detection of diseases and\nfor robotic-assisted minimally invasive surgery (RMIS). Numerous deep learning\n(DL)-based research works have been developed for automated diagnosis or\nprocessing of endoscopic views. However, existing DL models may suffer from\ncatastrophic forgetting. When new target classes are introduced over time or\ncross institutions, the performance of old classes may suffer severe\ndegradation. More seriously, data privacy and storage issues may lead to the\nunavailability of old data when updating the model. Therefore, it is necessary\nto develop a continual learning (CL) methodology to solve the problem of\ncatastrophic forgetting in endoscopic image segmentation. To tackle this, we\npropose an Endoscopy Continual Semantic Segmentation (EndoCSS) framework that\ndoes not involve the storage and privacy issues of exemplar data. The framework\nincludes a mini-batch pseudo-replay (MB-PR) mechanism and a self-adaptive noisy\ncross-entropy (SAN-CE) loss. The MB-PR strategy circumvents privacy and storage\nissues by generating pseudo-replay images through a generative model.\nMeanwhile, the MB-PR strategy can also correct the model deviation to the\nreplay data and current training data, which arises from the significant\ndifference in the amounts of current and replay images. Therefore, the model can\nperform effective representation learning on both new and old tasks. The SAN-CE\nloss can help model fitting by adjusting the model's output logits, and also\nimprove the robustness of training. Extensive continual semantic segmentation\n(CSS) experiments on public datasets demonstrate that our method can robustly\nand effectively address the catastrophic forgetting brought by class increment\nin endoscopy scenes.
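The pixel-grid pipeline described in the superpixels abstract is easy to prototype with networkx: build a 4-connected grid weighted by color similarity and label each detected community as a proto-superpixel. The merging step down to a fixed superpixel count is omitted, and the Gaussian similarity kernel is an assumption; this is a sketch, not the paper's code.

```python
# Illustrative community-detection superpixels on a 4-connected pixel grid.
# Practical only for small images; greedy modularity is slow on large grids.
import numpy as np
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

def superpixels_by_communities(image, sigma=10.0):
    # image: (H, W, 3) float array; returns an (H, W) integer label map.
    H, W, _ = image.shape
    G = nx.grid_2d_graph(H, W)  # nodes are (row, col); 4-connectivity
    for (u, v) in G.edges():
        diff = np.linalg.norm(image[u] - image[v])
        G[u][v]["weight"] = float(np.exp(-(diff ** 2) / (2 * sigma ** 2)))
    labels = np.zeros((H, W), dtype=np.int32)
    for k, community in enumerate(greedy_modularity_communities(G, weight="weight")):
        for (i, j) in community:
            labels[i, j] = k
    return labels
```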
The results show that our framework holds excellent\npotential for real-world deployment in a streaming learning manner.\n","authors":["Guankun Wang","Long Bai","Yanan Wu","Tong Chen","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.14100v1.pdf","comment":"Accepted by Computers in Biology and Medicine"},{"id":"http://arxiv.org/abs/2307.00724v3","updated":"2023-08-27T12:49:57Z","published":"2023-07-03T03:09:44Z","title":"LXL: LiDAR Excluded Lean 3D Object Detection with 4D Imaging Radar and\n Camera Fusion","summary":" As an emerging technology and a relatively affordable device, the 4D imaging\nradar has already been confirmed effective in performing 3D object detection in\nautonomous driving. Nevertheless, the sparsity and noisiness of 4D radar point\nclouds hinder further performance improvement, and in-depth studies about their\nfusion with other modalities are lacking. On the other hand, as a new image\nview transformation strategy, \"sampling\" has been applied in a few image-based\ndetectors and shown to outperform the widely applied \"depth-based splatting\"\nproposed in Lift-Splat-Shoot (LSS), even without image depth prediction.\nHowever, the potential of \"sampling\" is not fully unleashed. In this paper, we\ninvestigate the \"sampling\" view transformation strategy on the camera and 4D\nimaging radar fusion-based 3D object detection. In the proposed LiDAR Excluded\nLean (LXL) model, predicted image depth distribution maps and radar 3D\noccupancy grids are generated from image perspective view (PV) features and\nradar bird's eye view (BEV) features, respectively. They are sent to the core\nof LXL, called \"radar occupancy-assisted depth-based sampling\", to aid image\nview transformation. Introducing image depths and radar information enhances\nthe \"sampling\" strategy and leads to more accurate view transformation.\nExperiments on VoD and TJ4DRadSet datasets show that the proposed method\noutperforms the state-of-the-art 3D object detection methods by a significant\nmargin without bells and whistles. Ablation studies demonstrate that our method\nperforms the best among different enhancement settings.\n","authors":["Weiyi Xiong","Jianan Liu","Tao Huang","Qing-Long Han","Yuxuan Xia","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.00724v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14087v1","updated":"2023-08-27T12:20:28Z","published":"2023-08-27T12:20:28Z","title":"A comprehensive review on Plant Leaf Disease detection using Deep\n learning","summary":" Leaf diseases are common and can be fatal for plants. Early diagnosis and\ndetection are necessary in order to improve the prognosis of leaf diseases\naffecting plants. For predicting leaf disease, several automated systems have\nalready been developed using different plant pathology imaging modalities. This\npaper provides a systematic review of the literature on leaf disease-based\nmodels for the diagnosis of various plant leaf diseases via deep learning. The\nadvantages and limitations of different deep learning models including Vision\nTransformer (ViT), Deep convolutional neural network (DCNN), Convolutional\nneural network (CNN), Residual Skip Network-based Super-Resolution for Leaf\nDisease Detection (RSNSR-LDD), Disease Detection Network (DDN), and YOLO (You\nonly look once) are described in this review. The review also shows that the\nstudies related to leaf disease detection applied different deep learning\nmodels to a number of publicly available datasets.
For comparing the\nperformance of the models, different metrics such as accuracy, precision,\nrecall, etc. were used in the existing studies.\n","authors":["Sumaya Mustofa","Md Mehedi Hasan Munna","Yousuf Rayhan Emon","Golam Rabbany","Md Taimur Ahad"],"pdf_url":"https://arxiv.org/pdf/2308.14087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14084v1","updated":"2023-08-27T12:12:27Z","published":"2023-08-27T12:12:27Z","title":"Practical Edge Detection via Robust Collaborative Learning","summary":" Edge detection, as a core component in a wide range of vision-oriented tasks,\nis to identify object boundaries and prominent edges in natural images. An edge\ndetector is desired to be both efficient and accurate for practical use. To\nachieve the goal, two key issues should be concerned: 1) How to liberate deep\nedge models from inefficient pre-trained backbones that are leveraged by most\nexisting deep learning methods, for saving the computational cost and cutting\nthe model size; and 2) How to mitigate the negative influence from noisy or\neven wrong labels in training data, which widely exist in edge detection due to\nthe subjectivity and ambiguity of annotators, for the robustness and accuracy.\nIn this paper, we attempt to simultaneously address the above problems via\ndeveloping a collaborative learning based model, termed PEdger. The principle\nbehind our PEdger is that, the information learned from different training\nmoments and heterogeneous (recurrent and non-recurrent in this work)\narchitectures, can be assembled to explore robust knowledge against noisy\nannotations, even without the help of pre-training on extra data. Extensive\nablation studies together with quantitative and qualitative experimental\ncomparisons on the BSDS500 and NYUD datasets are conducted to verify the\neffectiveness of our design, and demonstrate its superiority over other\ncompetitors in terms of accuracy, speed, and model size. Codes can be found at\nhttps://github.com/ForawardStar/PEdger.\n","authors":["Yuanbin Fu","Xiaojie Guo"],"pdf_url":"https://arxiv.org/pdf/2308.14084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14083v1","updated":"2023-08-27T12:08:49Z","published":"2023-08-27T12:08:49Z","title":"4D Myocardium Reconstruction with Decoupled Motion and Shape Model","summary":" Estimating the shape and motion state of the myocardium is essential in\ndiagnosing cardiovascular diseases. However, cine magnetic resonance (CMR)\nimaging is dominated by 2D slices, whose large slice spacing challenges\ninter-slice shape reconstruction and motion acquisition. To address this\nproblem, we propose a 4D reconstruction method that decouples motion and shape,\nwhich can predict the inter-/intra-slice shape and motion from a given\nsparse point cloud sequence obtained from limited slices. Our framework\ncomprises a neural motion model and an end-diastolic (ED) shape model. The\nimplicit ED shape model can learn a continuous boundary and encourage the\nmotion model to predict without the supervision of ground truth deformation,\nand the motion model enables canonical input of the shape model by deforming\nany point from any phase to the ED phase. Additionally, the constructed\nED-space enables pre-training of the shape model, thereby guiding the motion\nmodel and addressing the issue of data scarcity.
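The decoupled design described above composes two networks: a motion model that deforms a point at any cardiac phase back to the ED phase, and an implicit ED shape model evaluated at the deformed location. A hypothetical minimal sketch (layer sizes and interfaces are assumptions, not the paper's architecture):

```python
# Hedged sketch of a decoupled motion + implicit ED shape composition.
import torch
import torch.nn as nn

class MotionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(),
                                 nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 3))

    def forward(self, x, t):
        # x: (N, 3) points observed at cardiac phase t in [0, 1].
        phase = torch.full((x.shape[0], 1), float(t), device=x.device)
        return x + self.net(torch.cat([x, phase], dim=-1))  # displacement to ED

class EDShapeModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(3, 128), nn.ReLU(),
                                 nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 1))

    def forward(self, x_ed):
        return self.net(x_ed)  # implicit value (e.g. signed distance) at ED

# Composition: value_at_phase_t = EDShapeModel()(MotionModel()(x, t)),
# which is what lets the shape network always see canonical ED-space input.
```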
We propose the first 4D\nmyocardial dataset to our knowledge, and we verify our method on the proposed, public,\nand cross-modal datasets, showing superior reconstruction performance and\nenabling various clinical applications.\n","authors":["Xiaohan Yuan","Cong Liu","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14083v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14082v1","updated":"2023-08-27T12:01:11Z","published":"2023-08-27T12:01:11Z","title":"Reconstructing Interacting Hands with Interaction Prior from Monocular\n Images","summary":" Reconstructing interacting hands from monocular images is indispensable in\nAR/VR applications. Most existing solutions rely on the accurate localization\nof each skeleton joint. However, these methods tend to be unreliable due to the\nsevere occlusion and confusing similarity among adjacent hand parts. This also\ndefies human perception because humans can quickly imitate an interaction\npattern without localizing all joints. Our key idea is to first construct a\ntwo-hand interaction prior and recast the interaction reconstruction task as\nthe conditional sampling from the prior. To cover more interaction states, a\nlarge-scale multimodal dataset with physical plausibility is proposed. Then a\nVAE is trained to further condense these interaction patterns as latent codes\nin a prior distribution. When looking for image cues that contribute to\ninteraction prior sampling, we propose the interaction adjacency heatmap (IAH).\nCompared with a joint-wise heatmap for localization, IAH assigns denser visible\nfeatures to those invisible joints. Compared with an all-in-one visible\nheatmap, it provides more fine-grained local interaction information in each\ninteraction region. Finally, the correlations between the extracted features\nand corresponding interaction codes are linked by the ViT module. Comprehensive\nevaluations on benchmark datasets have verified the effectiveness of this\nframework. The code and dataset are publicly available at\nhttps://github.com/binghui-z/InterPrior_pytorch\n","authors":["Binghui Zuo","Zimeng Zhao","Wenqian Sun","Wei Xie","Zhou Xue","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14082v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14081v1","updated":"2023-08-27T11:57:56Z","published":"2023-08-27T11:57:56Z","title":"U-SEANNet: A Simple, Efficient and Applied U-Shaped Network for\n Diagnosing Nasal Diseases from Nasal Endoscopic Images","summary":" Utilizing deep learning (DL) models to improve the early diagnosis of nasal\ndiseases from nasal endoscopic images holds paramount importance. However, the\nlack of available datasets stymies advancements in this field. Furthermore,\nexisting models fail to strike a good trade-off between model diagnosis\nperformance, model complexity and parameter size, rendering them unsuitable for\npractical application. To bridge these gaps, we created the first large-scale\nnasal endoscopy dataset, named 7-NasEID, comprising 11,352 images that span six\nnasal diseases and normal samples. Building on this, we proposed U-SEANNet, an\ninnovative architecture, underpinned by depth-wise separable convolutions.\nAdditionally, to augment its discernment capabilities for subtle variations in\ninput images, we further proposed the Global-Local Channel Feature Fusion\nModule, enabling the U-SEANNet to focus on salient channel features from both\nglobal and local contexts.
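Without the paper's exact definition, a global-local channel fusion of the kind named above might combine squeeze-and-excitation-style global statistics with a local channel interaction; the following is purely an illustrative guess at such a module, not the published design.

```python
# Speculative sketch: global FC branch + local 1D-conv branch over channel stats.
import torch
import torch.nn as nn

class GlobalLocalChannelFusion(nn.Module):
    def __init__(self, channels, reduction=8):
        super().__init__()
        self.global_fc = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels),
        )
        self.local_conv = nn.Conv1d(1, 1, kernel_size=3, padding=1)

    def forward(self, x):
        # x: (B, C, H, W)
        B, C, _, _ = x.shape
        pooled = x.mean(dim=(2, 3))                          # global channel stats
        g = self.global_fc(pooled)                           # (B, C) global branch
        l = self.local_conv(pooled.unsqueeze(1)).squeeze(1)  # (B, C) local branch
        attn = torch.sigmoid(g + l).view(B, C, 1, 1)
        return x * attn                                      # reweight channels
```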
Notably, U-SEANNet's parameter size and GFLOPs are\nonly 0.78M and 0.21, respectively. Employing the 7-NasEID dataset, we conducted\nfive-fold cross-validation on U-SEANNet, juxtaposing its performance against\nseventeen renowned architectures. The experimental results suggest that U-SEANNet is\nthe state-of-the-art (SOTA) model, achieving an accuracy of 93.58%, a sensitivity\nof 90.17%, and a specificity of 91.27%. These findings demonstrate U-SEANNet's\nprodigious potential for diagnosing nasal diseases in practical use, offering\nnew insight for the development of effective nasal disease diagnosis tools.\n","authors":["Yubiao Yue","Jun Xue","Haihua Liang","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.14081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08648v3","updated":"2023-08-27T11:52:50Z","published":"2023-06-14T17:28:45Z","title":"SimpleMapping: Real-Time Visual-Inertial Dense Mapping with Deep\n Multi-View Stereo","summary":" We present a real-time visual-inertial dense mapping method capable of\nperforming incremental 3D mesh reconstruction with high quality using only\nsequential monocular images and inertial measurement unit (IMU) readings. 6-DoF\ncamera poses are estimated by a robust feature-based visual-inertial odometry\n(VIO), which also generates noisy sparse 3D map points as a by-product. We\npropose a sparse point aided multi-view stereo neural network (SPA-MVSNet) that\ncan effectively leverage the informative but noisy sparse points from the VIO\nsystem. The sparse depth from VIO is first completed by a single-view depth\ncompletion network. This dense depth map, although naturally limited in\naccuracy, is then used as a prior to guide our MVS network in the cost volume\ngeneration and regularization for accurate dense depth prediction. Predicted\ndepth maps of keyframe images by the MVS network are incrementally fused into a\nglobal map using TSDF-Fusion. We extensively evaluate both the proposed\nSPA-MVSNet and the entire visual-inertial dense mapping system on several\npublic datasets as well as our own dataset, demonstrating the system's\nimpressive generalization capabilities and its ability to deliver high-quality\n3D mesh reconstruction online. Our proposed dense mapping system achieves a\n39.7% improvement in F-score over existing systems when evaluated on the\nchallenging scenarios of the EuRoC dataset.\n","authors":["Yingye Xin","Xingxing Zuo","Dongyue Lu","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2306.08648v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14078v1","updated":"2023-08-27T11:52:00Z","published":"2023-08-27T11:52:00Z","title":"Sparse3D: Distilling Multiview-Consistent Diffusion for Object\n Reconstruction from Sparse Views","summary":" Reconstructing 3D objects from extremely sparse views is a long-standing and\nchallenging problem. While recent techniques employ image diffusion models for\ngenerating plausible images at novel viewpoints or for distilling pre-trained\ndiffusion priors into 3D representations using score distillation sampling\n(SDS), these methods often struggle to simultaneously achieve high-quality,\nconsistent, and detailed results for both novel-view synthesis (NVS) and\ngeometry. In this work, we present Sparse3D, a novel 3D reconstruction method\ntailored for sparse view inputs.
Our approach distills robust priors from a\nmultiview-consistent diffusion model to refine a neural radiance field.\nSpecifically, we employ a controller that harnesses epipolar features from\ninput views, guiding a pre-trained diffusion model, such as Stable Diffusion,\nto produce novel-view images that maintain 3D consistency with the input. By\ntapping into 2D priors from powerful image diffusion models, our integrated\nmodel consistently delivers high-quality results, even when faced with\nopen-world objects. To address the blurriness introduced by conventional SDS,\nwe introduce the category-score distillation sampling (C-SDS) to enhance\ndetail. We conduct experiments on CO3DV2, a multi-view dataset of\nreal-world objects. Both quantitative and qualitative evaluations demonstrate\nthat our approach outperforms previous state-of-the-art works on the metrics\nregarding NVS and geometry reconstruction.\n","authors":["Zi-Xin Zou","Weihao Cheng","Yan-Pei Cao","Shi-Sheng Huang","Ying Shan","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14076v1","updated":"2023-08-27T11:49:46Z","published":"2023-08-27T11:49:46Z","title":"A Novel Multi-scale Attention Feature Extraction Block for Aerial Remote\n Sensing Image Classification","summary":" Classification of very high-resolution (VHR) aerial remote sensing (RS)\nimages is a well-established research area in the remote sensing community as\nit provides valuable spatial information for decision-making. Existing works on\nVHR aerial RS image classification achieve excellent classification\nperformance; nevertheless, they have a limited capability to represent VHR\nRS images containing complex and small objects, thereby leading to performance\ninstability. As such, we propose a novel plug-and-play multi-scale attention\nfeature extraction block (MSAFEB) based on multi-scale convolution at two\nlevels with skip connection, producing discriminative/salient information at a\ndeeper/finer level. The experimental study on two benchmark VHR aerial RS image\ndatasets (AID and NWPU) demonstrates that our proposal achieves a\nstable/consistent performance (minimum standard deviation of $0.002$) and\ncompetent overall classification performance (AID: 95.85\\% and NWPU: 94.09\\%).\n","authors":["Chiranjibi Sitaula","Jagannath Aryal","Avik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.14076v1.pdf","comment":"The paper is under review in IEEE Geoscience and Remote Sensing\n Letters Journal (IEEE-GRSL). This version may be deleted and/or updated based\n on the journal's policy"},{"id":"http://arxiv.org/abs/2308.14075v1","updated":"2023-08-27T11:38:42Z","published":"2023-08-27T11:38:42Z","title":"FaceCoresetNet: Differentiable Coresets for Face Set Recognition","summary":" In set-based face recognition, we aim to compute the most discriminative\ndescriptor from an unbounded set of images and videos showing a single person.\nA discriminative descriptor balances two policies when aggregating information\nfrom a given set. The first is a quality-based policy: emphasizing high-quality\nand down-weighting low-quality images. The second is a diversity-based policy:\nemphasizing unique images in the set and down-weighting multiple occurrences of\nsimilar images as found in video clips which can overwhelm the set\nrepresentation. This work frames face-set representation as a differentiable\ncoreset selection problem.
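For context, plain score distillation sampling (SDS), which Sparse3D's C-SDS refines, can be condensed as below. `diffusion_eps` stands in for a frozen pretrained noise predictor and is an assumed interface; the C-SDS variant itself is not reproduced here.

```python
# Hedged SDS sketch: noise a rendered image, query the frozen diffusion prior,
# and use (eps_pred - noise) as the gradient pushed back into the 3D model.
import torch

def sds_loss(rendered, diffusion_eps, alphas_cumprod, t):
    # rendered: (B, C, H, W) image rendered from the current 3D representation;
    # t: (B,) integer timesteps; alphas_cumprod: 1-D schedule tensor.
    noise = torch.randn_like(rendered)
    a_t = alphas_cumprod[t].view(-1, 1, 1, 1)
    noisy = a_t.sqrt() * rendered + (1 - a_t).sqrt() * noise
    eps_pred = diffusion_eps(noisy, t)          # frozen diffusion prior
    grad = (eps_pred - noise).detach()          # no backprop through the prior
    # Surrogate loss whose gradient w.r.t. `rendered` equals `grad`.
    return (grad * rendered).sum()
```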
Our model learns how to select a small coreset of\nthe input set that balances quality and diversity policies using a learned\nmetric parameterized by the face quality, optimized end-to-end. The selection\nprocess is a differentiable farthest-point sampling (FPS) realized by\napproximating the non-differentiable Argmax operation with differentiable\nsampling from the Gumbel-Softmax distribution of distances. The small coreset\nis later used as queries in a self and cross-attention architecture to enrich\nthe descriptor with information from the whole set. Our model is\norder-invariant and linear in the input set size. We set a new SOTA for set-based face\nverification on the IJB-B and IJB-C datasets. Our code is publicly available.\n","authors":["Gil Shapira","Yosi Keller"],"pdf_url":"https://arxiv.org/pdf/2308.14075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14074v1","updated":"2023-08-27T11:37:26Z","published":"2023-08-27T11:37:26Z","title":"Nonrigid Object Contact Estimation With Regional Unwrapping Transformer","summary":" Acquiring contact patterns between hands and nonrigid objects is a common\nconcern in the vision and robotics community. However, existing learning-based\nmethods focus more on contact with rigid ones from monocular images. When\nadopting them for nonrigid contact, a major problem is that the existing\ncontact representation is restricted by the geometry of the object.\nConsequently, contact neighborhoods are stored in an unordered manner and\ncontact features are difficult to align with image cues. At the core of our\napproach lies a novel hand-object contact representation called RUPs (Region\nUnwrapping Profiles), which unwrap the roughly estimated hand-object surfaces\nas multiple high-resolution 2D regional profiles. The region grouping strategy\nis consistent with the hand kinematic bone division because they are the\nprimitive initiators for a composite contact pattern. Based on this\nrepresentation, our Regional Unwrapping Transformer (RUFormer) learns the\ncorrelation priors across regions from monocular inputs and predicts\ncorresponding contact and deformed transformations. Our experiments demonstrate\nthat the proposed framework can robustly estimate the deformed degrees and\ndeformed transformations, which makes it suitable for both nonrigid and rigid\ncontact.\n","authors":["Wei Xie","Zimeng Zhao","Shiying Li","Binghui Zuo","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14074v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14070v1","updated":"2023-08-27T11:04:26Z","published":"2023-08-27T11:04:26Z","title":"DETDet: Dual Ensemble Teeth Detection","summary":" The field of dentistry is in the era of digital transformation. Particularly,\nartificial intelligence is anticipated to play a significant role in digital\ndentistry. AI holds the potential to significantly assist dental practitioners\nand elevate diagnostic accuracy. In alignment with this vision, the 2023 MICCAI\nDENTEX challenge aims to enhance the performance of dental panoramic X-ray\ndiagnosis and enumeration through technological advancement. In response, we\nintroduce DETDet, a Dual Ensemble Teeth Detection network. DETDet encompasses\ntwo distinct modules dedicated to enumeration and diagnosis. Leveraging the\nadvantages of teeth mask data, we employ Mask-RCNN for the enumeration module.\nFor the diagnosis module, we adopt an ensemble model comprising DiffusionDet\nand DINO.
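The differentiable FPS with Gumbel-Softmax described in the FaceCoresetNet abstract can be sketched as follows; the temperature, initialization, and straight-through (hard) sampling choice are assumptions rather than the paper's exact settings.

```python
# Hedged sketch: replace FPS's argmax over distances with Gumbel-Softmax sampling,
# keeping the selection differentiable end-to-end.
import torch
import torch.nn.functional as F

def differentiable_fps(feats, k, tau=1.0):
    # feats: (N, D) set elements; returns a (k, D) soft-selected coreset.
    selected = [feats[torch.randint(feats.shape[0], (1,)).item()]]
    for _ in range(k - 1):
        ref = torch.stack(selected)                          # (S, D) current coreset
        dist = torch.cdist(feats, ref).min(dim=1).values     # distance to coreset
        # One-hot-like differentiable pick of a far point (straight-through).
        w = F.gumbel_softmax(dist.log_softmax(dim=0), tau=tau, hard=True)
        selected.append(w @ feats)
    return torch.stack(selected)
```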
To further enhance precision scores, we integrate a complementary\nmodule to harness the potential of unlabeled data. The code for our approach\nwill be made accessible at https://github.com/Bestever-choi/Evident\n","authors":["Kyoungyeon Choi","Jaewon Shin","Eunyi Lyou"],"pdf_url":"https://arxiv.org/pdf/2308.14070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14066v1","updated":"2023-08-27T10:39:33Z","published":"2023-08-27T10:39:33Z","title":"Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential\n Generative Adversarial Networks","summary":" In this paper, we propose a bi-modality medical image synthesis approach\nbased on sequential generative adversarial network (GAN) and semi-supervised\nlearning. Our approach consists of two generative modules that synthesize\nimages of the two modalities in a sequential order. A method for measuring the\nsynthesis complexity is proposed to automatically determine the synthesis order\nin our sequential GAN. Images of the modality with a lower complexity are\nsynthesized first, and the counterparts with a higher complexity are generated\nlater. Our sequential GAN is trained end-to-end in a semi-supervised manner. In\nsupervised training, the joint distribution of bi-modality images is learned\nfrom real paired images of the two modalities by explicitly minimizing the\nreconstruction losses between the real and synthetic images. To avoid\noverfitting to the limited training images, in unsupervised training, the marginal\ndistribution of each modality is learned based on unpaired images by minimizing\nthe Wasserstein distance between the distributions of real and fake images. We\ncomprehensively evaluate the proposed model using two synthesis tasks based on\nthree types of evaluation metrics and user studies. Visual and quantitative\nresults demonstrate the superiority of our method over the state-of-the-art\nmethods, as well as its reasonable visual quality and clinical significance. Code is made\npublicly available at\nhttps://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis.\n","authors":["Xin Yang","Yi Lin","Zhiwei Wang","Xin Li","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.14066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14064v1","updated":"2023-08-27T10:32:52Z","published":"2023-08-27T10:32:52Z","title":"Multi-model fusion for Aerial Vision and Dialog Navigation based on\n human attention aids","summary":" Drones have been widely used in many areas of our daily lives. Natural-language\ncontrol relieves people of the burden of holding a controller all the time and makes\ndrone control easier for people with disabilities or occupied hands. However,\nthe control of aerial robots is more complicated compared to normal robots due\nto factors such as uncontrollable height. Therefore, it is crucial to develop\nan intelligent UAV that has the ability to talk to humans and follow natural\nlanguage commands. In this report, we present an aerial navigation task for the\n2023 ICCV Conversation History. Based on the AVDN dataset containing more than\n3k recorded navigation trajectories and asynchronous human-robot conversations,\nwe propose an effective fusion training method for the Human Attention Aided\nTransformer (HAA-Transformer) and Human Attention Aided LSTM (HAA-LSTM)\nmodels, which achieves the prediction of the navigation routing points and human\nattention.
The method not only achieves high SR and SPL metrics, but also shows\na 7% improvement in GP metrics compared to the baseline model.\n","authors":["Xinyi Wang","Xuan Cui","Danxu Li","Fang Liu","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2308.14064v1.pdf","comment":"4 pages, 1 figure"},{"id":"http://arxiv.org/abs/2302.08715v2","updated":"2023-08-27T10:08:54Z","published":"2023-02-17T06:14:37Z","title":"EEP-3DQA: Efficient and Effective Projection-based 3D Model Quality\n Assessment","summary":" Currently, great effort has been put into improving the\neffectiveness of 3D model quality assessment (3DQA) methods. However, little\nattention has been paid to the computational costs and inference time, which is\nalso important for practical applications. Unlike 2D media, 3D models are\nrepresented by more complicated and irregular digital formats, such as point\ncloud and mesh. Thus it is normally difficult to design an efficient module to\nextract quality-aware features of 3D models. In this paper, we address this\nproblem from the aspect of projection-based 3DQA and develop a no-reference\n(NR) \\underline{E}fficient and \\underline{E}ffective\n\\underline{P}rojection-based \\underline{3D} Model \\underline{Q}uality\n\\underline{A}ssessment (\\textbf{EEP-3DQA}) method. The input projection images\nof EEP-3DQA are randomly sampled from the six perpendicular viewpoints of the\n3D model and are further spatially downsampled by the grid-mini patch sampling\nstrategy. Further, the lightweight Swin-Transformer tiny is utilized as the\nbackbone to extract the quality-aware features. Finally, the proposed EEP-3DQA\nand EEP-3DQA-t (tiny version) achieve better performance than the existing\nstate-of-the-art NR-3DQA methods and even outperform most full-reference (FR)\n3DQA methods on the point cloud and mesh quality assessment databases while\nconsuming less inference time than the compared 3DQA methods.\n","authors":["Zicheng Zhang","Wei Sun","Yingjie Zhou","Wei Lu","Yucheng Zhu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2302.08715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14061v1","updated":"2023-08-27T10:03:48Z","published":"2023-08-27T10:03:48Z","title":"Hierarchical Contrastive Learning for Pattern-Generalizable Image\n Corruption Detection","summary":" Effective image restoration with large-size corruptions, such as blind image\ninpainting, entails precise detection of corruption region masks, which remains\nextremely challenging due to diverse shapes and patterns of corruptions. In\nthis work, we present a novel method for automatic corruption detection, which\nallows for blind corruption restoration without known corruption masks.\nSpecifically, we develop a hierarchical contrastive learning framework to\ndetect corrupted regions by capturing the intrinsic semantic distinctions\nbetween corrupted and uncorrupted regions. In particular, our model detects the\ncorrupted mask in a coarse-to-fine manner by first predicting a coarse mask by\ncontrastive learning in low-resolution feature space and then refining the\nuncertain area of the mask by high-resolution contrastive learning. A\nspecialized hierarchical interaction mechanism is designed to facilitate the\nknowledge propagation of contrastive learning in different scales, boosting the\nmodeling performance substantially. The detected multi-scale corruption masks\nare then leveraged to guide the corruption restoration.
By detecting corrupted\nregions through learned contrastive distinctions rather than the semantic\npatterns of corruptions, our model generalizes well across\ndifferent corruption patterns. Extensive experiments demonstrate the following\nmerits of our model: 1) the superior performance over other methods on both\ncorruption detection and various image restoration tasks including blind\ninpainting and watermark removal, and 2) strong generalization across different\ncorruption patterns such as graffiti, random noise or other image content.\nCodes and trained weights are available at https://github.com/xyfJASON/HCL .\n","authors":["Xin Feng","Yifeng Xu","Guangming Lu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.14061v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14058v1","updated":"2023-08-27T09:45:41Z","published":"2023-08-27T09:45:41Z","title":"Pruning the Unlabeled Data to Improve Semi-Supervised Learning","summary":" In the domain of semi-supervised learning (SSL), the conventional approach\ninvolves training a learner with a limited amount of labeled data alongside a\nsubstantial volume of unlabeled data, both drawn from the same underlying\ndistribution. However, for deep learning models, this standard practice may not\nyield optimal results. In this research, we propose an alternative perspective,\nsuggesting that distributions that are more readily separable could offer\nsuperior benefits to the learner as compared to the original distribution. To\nachieve this, we present PruneSSL, a practical technique for selectively\nremoving examples from the original unlabeled dataset to enhance its\nseparability. We present an empirical study, showing that although PruneSSL\nreduces the quantity of available training data for the learner, it\nsignificantly improves the performance of various competitive SSL algorithms,\nthereby achieving state-of-the-art results across several image classification\ntasks.\n","authors":["Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14058v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.14165v1","updated":"2023-08-27T17:58:32Z","published":"2023-08-27T17:58:32Z","title":"Distributional Off-Policy Evaluation for Slate Recommendations","summary":" Recommendation strategies are typically evaluated by using previously logged\ndata, employing off-policy evaluation methods to estimate their expected\nperformance. However, for strategies that present users with slates of multiple\nitems, the resulting combinatorial action space renders many of these methods\nimpractical. Prior work has developed estimators that leverage the structure in\nslates to estimate the expected off-policy performance, but the estimation of\nthe entire performance distribution remains elusive. Estimating the complete\ndistribution allows for a more comprehensive evaluation of recommendation\nstrategies, particularly along the axes of risk and fairness that employ\nmetrics computable from the distribution. In this paper, we propose an\nestimator for the complete off-policy performance distribution for slates and\nestablish conditions under which the estimator is unbiased and consistent. This\nbuilds upon prior work on off-policy evaluation for slates and off-policy\ndistribution estimation in reinforcement learning. We validate the efficacy of\nour method empirically on synthetic data as well as on a slate recommendation\nsimulator constructed from real-world data (MovieLens-20M).
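For orientation on the slate off-policy setting above, the simplest point estimator is plain importance weighting over logged slates, shown below; this is a generic baseline with assumed known logging propensities, not the distributional estimator the paper proposes.

```python
# Generic inverse-propensity (IPS) estimate of a target policy's expected
# slate reward from logged data.
import numpy as np

def ips_slate_value(rewards, logged_propensities, target_propensities):
    # rewards: (N,) observed slate-level rewards under the logging policy.
    # *_propensities: (N,) probability of each logged slate under each policy.
    weights = target_propensities / logged_propensities
    return float(np.mean(weights * rewards))
```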
Our results show a\nsignificant reduction in estimation variance and improved sample efficiency\nover prior work across a range of slate structures.\n","authors":["Shreyas Chaudhari","David Arbour","Georgios Theocharous","Nikos Vlassis"],"pdf_url":"https://arxiv.org/pdf/2308.14165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14155v1","updated":"2023-08-27T16:43:06Z","published":"2023-08-27T16:43:06Z","title":"Only Encode Once: Making Content-based News Recommender Greener","summary":" Large pretrained language models (PLM) have become de facto news encoders in\nmodern news recommender systems, due to their strong ability in comprehending\ntextual content. These huge Transformer-based architectures, when finetuned on\nrecommendation tasks, can greatly improve news recommendation performance.\nHowever, the PLM-based pretrain-finetune framework incurs high computational\ncost and energy consumption, primarily due to the extensive redundant\nprocessing of news encoding during each training epoch. In this paper, we\npropose the ``Only Encode Once'' framework for news recommendation (OLEO), by\ndecoupling news representation learning from downstream recommendation task\nlearning. The decoupled design makes content-based news recommender as green\nand efficient as id-based ones, leading to great reduction in computational\ncost and training resources. Extensive experiments show that our OLEO framework\ncan reduce carbon emissions by up to 13 times compared with the\nstate-of-the-art pretrain-finetune framework and maintain a competitive or even\nsuperior performance level. The source code is released for reproducibility.\n","authors":["Qijiong Liu","Jieming Zhu","Quanyu Dai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.14155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.00676v2","updated":"2023-08-27T16:26:57Z","published":"2022-09-01T18:24:16Z","title":"Analyzing and visualizing polarization and balance with signed networks:\n the U.S. Congress case study","summary":" Signed networks and balance theory provide a natural setting for real-world\nscenarios that show polarization dynamics, positive/negative relationships, and\npolitical partisanship. For example, they have been proven effective in\nstudying the increasing polarization of the votes in the two chambers of the\nU.S. Congress from World War II on.\n To provide further insights into this particular case study, we propose the\napplication of a pipeline to analyze and visualize a signed graph's\nconfiguration based on the exploitation of the corresponding Laplacian matrix'\nspectral properties. The overall methodology is comparable with others based on\nthe frustration index, but it has at least two main advantages: first, it\nrequires a much lower computational cost; second, it allows for a quantitative\nand visual assessment of how arbitrarily small subgraphs (even single nodes)\ncontribute to the overall balance (or unbalance) of the network.\n The proposed pipeline allows the exploration of polarization dynamics shown\nby the U.S. Congress from 1945 to 2020 at different resolution scales. 
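To make the spectral idea above concrete, here is a minimal Python sketch (numpy only) of assessing the balance of a signed graph via the signed Laplacian, whose smallest eigenvalue is zero exactly for structurally balanced networks. The toy adjacency matrix and the reading of the least eigenvector as per-node contributions are illustrative assumptions, not the authors' exact pipeline.

    # Sketch: spectral balance assessment of a signed graph (toy example).
    # The signed Laplacian is L = D_abs - A, where D_abs uses |edge weight|.
    # Its smallest eigenvalue is 0 iff the network is structurally balanced;
    # larger values indicate more frustration.
    import numpy as np

    # Signed adjacency: +1 cooperation, -1 opposition (hypothetical data).
    A = np.array([
        [0,  1, -1,  1],
        [1,  0, -1,  1],
        [-1, -1, 0, -1],
        [1,  1, -1,  0],
    ], dtype=float)

    D_abs = np.diag(np.abs(A).sum(axis=1))
    L_signed = D_abs - A

    eigvals, eigvecs = np.linalg.eigh(L_signed)   # ascending eigenvalues
    print("smallest eigenvalue (0 => balanced):", eigvals[0])
    # For a balanced graph the least eigenvector encodes the two opposing
    # factions; in unbalanced graphs its entries indicate how much each
    # node (or subgraph) contributes to the residual frustration.
    print("least-eigenvector entries:", eigvecs[:, 0])

This is cheap (one symmetric eigendecomposition) compared with computing the frustration index, which matches the computational-cost advantage claimed above.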
In fact,\nwe are able to spot and point out the influence of some (groups of) congressmen\non the overall balance, as well as to observe and explore the evolution of\npolarization in both chambers across the years.\n","authors":["Arthur Capozzi","Alfonso Semeraro","Giancarlo Ruffo"],"pdf_url":"https://arxiv.org/pdf/2209.00676v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v3","updated":"2023-08-27T11:21:38Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for a satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is a language distribution\ndiscrepancy among different fields of the structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their lengths\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At the pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At the\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v3.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2304.07920v2","updated":"2023-08-27T10:09:22Z","published":"2023-04-17T00:05:52Z","title":"Causal Decision Transformer for Recommender Systems via Offline\n Reinforcement Learning","summary":" Reinforcement learning-based recommender systems have recently gained\npopularity. However, the design of the reward function, on which the agent\nrelies to optimize its recommendation policy, is often not straightforward.\nExploring the causality underlying users' behavior can take the place of the\nreward function in guiding the agent to capture the dynamic interests of users.\nMoreover, due to the typical limitations of simulation environments (e.g., data\ninefficiency), most of the work cannot be broadly applied in large-scale\nsituations. Although some works attempt to convert the offline dataset into a\nsimulator, data inefficiency makes the learning process even slower. Because of\nthe nature of reinforcement learning (i.e., learning by interaction), it cannot\ncollect enough data to train during a single interaction. 
Furthermore,\ntraditional reinforcement learning algorithms do not have the solid capability\nof supervised learning methods to learn directly from offline datasets. In\nthis paper, we propose a new model named the causal decision transformer for\nrecommender systems (CDT4Rec). CDT4Rec is an offline reinforcement learning\nsystem that can learn from a dataset rather than from online interaction.\nMoreover, CDT4Rec employs the transformer architecture, which is capable of\nprocessing large offline datasets and capturing both short-term and long-term\ndependencies within the data to estimate the causal relationship between\naction, state, and reward. To demonstrate the feasibility and superiority of\nour model, we have conducted experiments on six real-world offline datasets and\none online simulator.\n","authors":["Siyu Wang","Xiaocong Chen","Dietmar Jannach","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2304.07920v2.pdf","comment":"Accepted by SIGIR'23, please check the camera-ready version for more\n details such as the implementation"},{"id":"http://arxiv.org/abs/2308.14056v1","updated":"2023-08-27T09:40:52Z","published":"2023-08-27T09:40:52Z","title":"CTR is not Enough: a Novel Reinforcement Learning based Ranking Approach\n for Optimizing Session Clicks","summary":" Ranking is a crucial module in recommender systems. In particular,\nthe ranking module used in our YoungTao recommendation scenario provides\nan ordered list of items to users, to maximize the click number throughout the\nrecommendation session for each user. However, we found that the traditional\nranking method for optimizing Click-Through Rate (CTR) cannot address our\nranking scenario well, since it completely ignores user leaving, and CTR is an\noptimization goal for one-step recommendation. To effectively fulfill the\npurpose of our ranking module, we propose a long-term optimization goal, named\nCTE (Click-Through quantity expectation), to explicitly take the behavior\nof user leaving into account. Based on CTE, we propose an effective model\ntrained by reinforcement learning. Moreover, we build a simulation environment\nfrom offline log data for estimating PBR and CTR. We conduct extensive\nexperiments on offline datasets and an online e-commerce scenario in TaoBao.\nExperimental results show that our method can boost performance effectively.\n","authors":["Shaowei Liu","Yangjun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14029v1","updated":"2023-08-27T07:44:33Z","published":"2023-08-27T07:44:33Z","title":"Text Matching Improves Sequential Recommendation by Reducing Popularity\n Biases","summary":" This paper proposes Text mAtching based SequenTial rEcommendation model\n(TASTE), which maps items and users in an embedding space and recommends items\nby matching their text representations. TASTE verbalizes items and user-item\ninteractions using identifiers and attributes of items. To better characterize\nuser behaviors, TASTE additionally proposes an attention sparsity method, which\nenables TASTE to model longer user-item interactions by reducing the\nself-attention computations during encoding. Our experiments show that TASTE\noutperforms the state-of-the-art methods on widely used sequential\nrecommendation datasets. TASTE alleviates the cold start problem by\nrepresenting long-tail items using full-text modeling and bringing the benefits\nof pretrained language models to recommendation systems. 
Our further analyses\nillustrate that TASTE significantly improves the recommendation accuracy by\nreducing the popularity bias of previous item id based recommendation models\nand returning more appropriate and text-relevant items to satisfy users. All\ncodes are available at https://github.com/OpenMatch/TASTE.\n","authors":["Zhenghao Liu","Sen Mei","Chenyan Xiong","Xiaohua Li","Shi Yu","Zhiyuan Liu","Yu Gu","Ge Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14029v1.pdf","comment":"Accepted by CIKM 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.14224v1","updated":"2023-08-27T22:59:08Z","published":"2023-08-27T22:59:08Z","title":"Modeling Player Personality Factors from In-Game Behavior and Affective\n Expression","summary":" Developing a thorough understanding of the target audience (and/or single\nindividuals) is a key factor for success - which is exceptionally important and\npowerful for the domain of video games that can not only benefit from informed\ndecision making during development, but ideally even tailor game content,\ndifficulty and player experience while playing. The granular assessment of\nindividual personality and differences across players is a particularly\ndifficult endeavor, given the highly variant human nature, disagreement in\npsychological background models and because of the effortful data collection\nthat most often builds upon long, time-consuming and deterrent questionnaires.\nIn this work, we explore possibilities to predict a series of player\npersonality questionnaire metrics from recorded in-game behavior and extend\nrelated work by explicitly adding affective dialog decisions to the game\nenvironment which could elevate the model's accuracy. Using random forest\nregression, we predicted a wide variety of personality metrics from seven\nestablished questionnaires across 62 players over 60 minute gameplay of a\ncustomized version of the role-playing game Fallout: New Vegas. While some\npersonality variables could already be identified from reasonable underlying\nin-game actions and affective expressions, we did not find ways to predict\nothers or encountered questionable correlations that could not be justified by\ntheoretical background literature. Yet, building on the initial opportunities\nof this explorative study, we are striving to massively enlarge our data set to\nplayers from an ecologically valid industrial game environment and investigate\nthe performance of more sophisticated machine learning approaches.\n","authors":["Reza Habibi","Johannes Pfau","Magy Seif El-Nasr"],"pdf_url":"https://arxiv.org/pdf/2308.14224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14220v1","updated":"2023-08-27T22:42:31Z","published":"2023-08-27T22:42:31Z","title":"On Active Learning for Gaussian Process-based Global Sensitivity\n Analysis","summary":" This paper explores the application of active learning strategies to\nadaptively learn Sobol indices for global sensitivity analysis. We demonstrate\nthat active learning for Sobol indices poses unique challenges due to the\ndefinition of the Sobol index as a ratio of variances estimated from Gaussian\nprocess surrogates. Consequently, learning strategies must either focus on\nconvergence in the numerator or the denominator of this ratio. However, rapid\nconvergence in either one does not guarantee convergence in the Sobol index. 
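For readers unfamiliar with Sobol indices, a minimal Monte Carlo sketch of the first-order index S_i = Var(E[Y|X_i]) / Var(Y) follows, using the standard Ishigami test function and the pick-freeze (Jansen) estimator. The paper instead estimates these variances from Gaussian process surrogates, so the estimator and test function here are background illustrations, not its method.

    # Sketch: Monte Carlo "pick-freeze" estimation of first-order Sobol
    # indices on the Ishigami function. Known reference values are roughly
    # S_1 ~ 0.31, S_2 ~ 0.44, S_3 = 0.
    import numpy as np

    rng = np.random.default_rng(0)

    def ishigami(x, a=7.0, b=0.1):
        return (np.sin(x[:, 0]) + a * np.sin(x[:, 1]) ** 2
                + b * x[:, 2] ** 4 * np.sin(x[:, 0]))

    n, d = 100_000, 3
    A = rng.uniform(-np.pi, np.pi, (n, d))
    B = rng.uniform(-np.pi, np.pi, (n, d))
    yA, yB = ishigami(A), ishigami(B)
    var_y = np.var(np.concatenate([yA, yB]))

    for i in range(d):
        ABi = A.copy()
        ABi[:, i] = B[:, i]              # replace only coordinate i
        yABi = ishigami(ABi)
        # Jansen estimator: V_i = Var(Y) - mean((f(B) - f(AB_i))^2) / 2.
        S_i = 1.0 - np.mean((yB - yABi) ** 2) / (2.0 * var_y)
        print(f"S_{i + 1} ~ {S_i:.3f}")

The ratio structure is visible here: noise in either the numerator (the main effect) or the denominator (the total variance) perturbs the index, which is the tension the active learning strategies above must manage.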
We\npropose a novel strategy for active learning that focuses on resolving the main\neffects of the Gaussian process (associated with the numerator of the Sobol\nindex) and compare this with existing strategies based on convergence in the\ntotal variance (the denominator of the Sobol index). The new strategy,\nimplemented through a new learning function termed the MUSIC (minimize\nuncertainty in Sobol index convergence), generally converges in Sobol index\nerror more rapidly than the existing strategies based on the Expected\nImprovement for Global Fit (EIGF) and the Variance Improvement for Global Fit\n(VIGF). Both strategies are compared with simple sequential random sampling and\nthe MUSIC learning function generally converges most rapidly for\nlow-dimensional problems. However, for high-dimensional problems, the\nperformance is comparable to random sampling. The new learning strategy is\ndemonstrated for a practical case of adaptive experimental design for\nlarge-scale Boundary Layer Wind Tunnel experiments.\n","authors":["Mohit Chauhan","Mariel Ojeda-Tuz","Ryan Catarelli","Kurtis Gurley","Dimitrios Tsapetis","Michael D. Shields"],"pdf_url":"https://arxiv.org/pdf/2308.14220v1.pdf","comment":"31 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.14216v1","updated":"2023-08-27T22:34:10Z","published":"2023-08-27T22:34:10Z","title":"Machine Learning for Administrative Health Records: A Systematic Review\n of Techniques and Applications","summary":" Machine learning provides many powerful and effective techniques for\nanalysing heterogeneous electronic health records (EHR). Administrative Health\nRecords (AHR) are a subset of EHR collected for administrative purposes, and\nthe use of machine learning on AHRs is a growing subfield of EHR analytics.\nExisting reviews of EHR analytics emphasise that the data-modality of the EHR\nlimits the breadth of suitable machine learning techniques, and pursuable\nhealthcare applications. Despite emphasising the importance of data modality,\nthe literature fails to analyse which techniques and applications are relevant\nto AHRs. AHRs contain uniquely well-structured, categorically encoded records\nwhich are distinct from other data-modalities captured by EHRs, and they can\nprovide valuable information pertaining to how patients interact with the\nhealthcare system.\n This paper systematically reviews AHR-based research, analysing 70 relevant\nstudies and spanning multiple databases. We identify and analyse which machine\nlearning techniques are applied to AHRs and which health informatics\napplications are pursued in AHR-based research. We also analyse how these\ntechniques are applied in pursuit of each application, and identify the\nlimitations of these approaches. We find that while AHR-based studies are\ndisconnected from each other, the use of AHRs in health informatics research is\nsubstantial and accelerating. Our synthesis of these studies highlights the\nutility of AHRs for pursuing increasingly complex and diverse research\nobjectives despite a number of pervading data- and technique-based limitations.\nFinally, through our findings, we propose a set of future research directions\nthat can enhance the utility of AHR data and machine learning techniques for\nhealth informatics research.\n","authors":["Adrian Caruana","Madhushi Bandara","Katarzyna Musial","Daniel Catchpoole","Paul J. 
Kennedy"],"pdf_url":"https://arxiv.org/pdf/2308.14216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14215v1","updated":"2023-08-27T22:27:57Z","published":"2023-08-27T22:27:57Z","title":"TimeTrail: Unveiling Financial Fraud Patterns through Temporal\n Correlation Analysis","summary":" In the field of financial fraud detection, understanding the underlying\npatterns and dynamics is important to ensure effective and reliable systems.\nThis research introduces a new technique, \"TimeTrail,\" which employs advanced\ntemporal correlation analysis to explain complex financial fraud patterns. The\ntechnique leverages time-related insights to provide transparent and\ninterpretable explanations for fraud detection decisions, enhancing\naccountability and trust.\n The \"TimeTrail\" methodology consists of three key phases: temporal data\nenrichment, dynamic correlation analysis, and interpretable pattern\nvisualization. Initially, raw financial transaction data is enriched with\ntemporal attributes. Dynamic correlations between these attributes are then\nquantified using innovative statistical measures. Finally, a unified\nvisualization framework presents these correlations in an interpretable manner.\nTo validate the effectiveness of \"TimeTrail,\" a study is conducted on a diverse\nfinancial dataset, surrounding various fraud scenarios. Results demonstrate the\ntechnique's capability to uncover hidden temporal correlations and patterns,\nperforming better than conventional methods in both accuracy and\ninterpretability. Moreover, a case study showcasing the application of\n\"TimeTrail\" in real-world scenarios highlights its utility for fraud detection.\n","authors":["Sushrut Ghimire"],"pdf_url":"https://arxiv.org/pdf/2308.14215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14207v1","updated":"2023-08-27T21:25:45Z","published":"2023-08-27T21:25:45Z","title":"Predictive Sparse Manifold Transform","summary":" We present Predictive Sparse Manifold Transform (PSMT), a minimalistic,\ninterpretable and biologically plausible framework for learning and predicting\nnatural dynamics. PSMT incorporates two layers where the first sparse coding\nlayer represents the input sequence as sparse coefficients over an overcomplete\ndictionary and the second manifold learning layer learns a geometric embedding\nspace that captures topological similarity and dynamic temporal linearity in\nsparse coefficients. We apply PSMT on a natural video dataset and evaluate the\nreconstruction performance with respect to contextual variability, the number\nof sparse coding basis functions and training samples. We then interpret the\ndynamic topological organization in the embedding space. We next utilize PSMT\nto predict future frames compared with two baseline methods with a static\nembedding space. We demonstrate that PSMT with a dynamic embedding space can\nachieve better prediction performance compared to static baselines. Our work\nestablishes that PSMT is an efficient unsupervised generative framework for\nprediction of future visual stimuli.\n","authors":["Yujia Xie","Xinhui Li","Vince D. 
Calhoun"],"pdf_url":"https://arxiv.org/pdf/2308.14207v1.pdf","comment":"Paper presented at the 1st Workshop on High-dimensional Learning\n Dynamics (HLD) at the 40th International Conference on Machine Learning\n (ICML) 2023, Honolulu, Hawaii, USA\n (https://sites.google.com/view/hidimlearning), 10 pages"},{"id":"http://arxiv.org/abs/2211.00646v2","updated":"2023-08-27T20:24:37Z","published":"2022-11-01T00:40:09Z","title":"Learning Melanocytic Cell Masks from Adjacent Stained Tissue","summary":" Melanoma is one of the most aggressive forms of skin cancer, causing a large\nproportion of skin cancer deaths. However, melanoma diagnoses by pathologists\nshows low interrater reliability. As melanoma is a cancer of the melanocyte,\nthere is a clear need to develop a melanocytic cell segmentation tool that is\nagnostic to pathologist variability and automates pixel-level annotation.\nGigapixel-level pathologist labeling, however, is impractical. Herein, we\npropose a means to train deep neural networks for melanocytic cell segmentation\nfrom hematoxylin and eosin (H&E) stained slides using paired\nimmunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean\nIOU of 0.64 despite imperfect ground-truth labels.\n","authors":["Mikio Tada","Maria L. Wei","Michael J. Keiser"],"pdf_url":"https://arxiv.org/pdf/2211.00646v2.pdf","comment":"{Medical Image Learning with Limited & Noisy Data Workshop at MICCAI\n 2022"},{"id":"http://arxiv.org/abs/2003.01052v6","updated":"2023-08-27T20:04:06Z","published":"2020-03-02T17:38:38Z","title":"How to choose the most appropriate centrality measure? A decision tree\n approach","summary":" Centrality metrics play a crucial role in network analysis, while the choice\nof specific measures significantly influences the accuracy of conclusions as\neach measure represents a unique concept of node importance. Among over 400\nproposed indices, selecting the most suitable ones for specific applications\nremains a challenge. Existing approaches -- model-based, data-driven, and\naxiomatic -- have limitations, requiring association with models, training\ndatasets, or restrictive axioms for each specific application. To address this,\nwe introduce the culling method, which relies on the expert concept of\ncentrality behavior on simple graphs. The culling method involves forming a set\nof candidate measures, generating a list of as small graphs as possible needed\nto distinguish the measures from each other, constructing a decision-tree\nsurvey, and identifying the measure consistent with the expert's concept. We\napply this approach to a diverse set of 40 centralities, including novel\nkernel-based indices, and combine it with the axiomatic approach. Remarkably,\nonly 13 small 1-trees are sufficient to separate all 40 measures, even for\npairs of closely related ones. By adopting simple ordinal axioms like\nSelf-consistency or Bridge axiom, the set of measures can be drastically\nreduced making the culling survey short. Applying the culling method provides\ninsightful findings on some centrality indices, such as PageRank, Bridging, and\ndissimilarity-based Eigencentrality measures, among others. 
The proposed\napproach offers a cost-effective solution in terms of labor and time,\ncomplementing existing methods for measure selection, and providing deeper\ninsights into the underlying mechanisms of centrality measures.\n","authors":["Pavel Chebotarev","Dmitry Gubanov"],"pdf_url":"https://arxiv.org/pdf/2003.01052v6.pdf","comment":"12 pages, 2 tables, 1 algorithm, 8 figures. Presentation has been\n improved"},{"id":"http://arxiv.org/abs/2308.14190v1","updated":"2023-08-27T19:43:43Z","published":"2023-08-27T19:43:43Z","title":"Score-Based Generative Models for PET Image Reconstruction","summary":" Score-based generative models have demonstrated highly promising results for\nmedical image reconstruction tasks in magnetic resonance imaging or computed\ntomography. However, their application to Positron Emission Tomography (PET) is\nstill largely unexplored. PET image reconstruction involves a variety of\nchallenges, including Poisson noise with high variance and a wide dynamic\nrange. To address these challenges, we propose several PET-specific adaptations\nof score-based generative models. The proposed framework is developed for both\n2D and 3D PET. In addition, we provide an extension to guided reconstruction\nusing magnetic resonance images. We validate the approach through extensive 2D\nand 3D $\\textit{in-silico}$ experiments with a model trained on\npatient-realistic data without lesions, and evaluate on data without lesions as\nwell as out-of-distribution data with lesions. This demonstrates the proposed\nmethod's robustness and significant potential for improved PET reconstruction.\n","authors":["Imraj RD Singh","Alexander Denker","Riccardo Barbano","Željko Kereta","Bangti Jin","Kris Thielemans","Peter Maass","Simon Arridge"],"pdf_url":"https://arxiv.org/pdf/2308.14190v1.pdf","comment":"35 pages, 16 figures, submitted to Journal of Machine Learning for\n Biomedical Imaging (MELBA)"},{"id":"http://arxiv.org/abs/2305.14704v3","updated":"2023-08-27T19:22:40Z","published":"2023-05-24T04:16:56Z","title":"Practical Batch Bayesian Sampling Algorithms for Online Adaptive Traffic\n Experimentation","summary":" Online controlled experiments have emerged as the industry gold standard for\nassessing new web features. As new web algorithms proliferate, the experimentation\nplatform faces increasing demands on the velocity of online experiments,\nwhich encourages adaptive traffic testing methods that speed up identifying the best\nvariant by efficiently allocating traffic. This paper proposes four Bayesian\nbatch bandit algorithms (NB-TS, WB-TS, NB-TTTS, WB-TTTS) for eBay's\nexperimentation platform, using summary batch statistics of a goal metric\nwithout incurring new engineering technical debts. The novel WB-TTTS, in\nparticular, proves to be an efficient, trustworthy and robust alternative to\nfixed-horizon A/B testing. Another novel contribution is to bring the\ntrustworthiness of best arm identification algorithms into the evaluation criteria\nand to highlight the existence of severe false positive inflation with equivalent\nbest arms. To gain the trust of experimenters, the experimentation platform\nmust consider both efficiency and trustworthiness; however, to the best of the\nauthors' knowledge, trustworthiness is rarely discussed as an important topic\nin the literature on either best arm identification or multi-armed bandits. 
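To ground the batch setting, the sketch below runs Beta-Bernoulli batch Thompson sampling on two deliberately equivalent arms and then estimates each arm's posterior probability of being best. It illustrates the false-positive concern discussed here under simplifying assumptions of our own; it is not the paper's NB-TS or WB-TTTS implementation.

    # Sketch: batch Thompson sampling with two *equivalent* arms. With no
    # reshaping, the posterior P(best) per arm often drifts away from the
    # fair 0.5/0.5 split as traffic concentrates on one arm.
    import numpy as np

    rng = np.random.default_rng(1)
    true_rates = [0.05, 0.05]                 # equivalent best arms
    alpha, beta = np.ones(2), np.ones(2)      # Beta(1, 1) priors

    for batch in range(50):
        # Allocate a batch of 1000 users proportionally to Thompson draws.
        draws = rng.beta(alpha, beta, size=(1000, 2))
        share = (draws.argmax(axis=1) == 1).mean()
        n = np.array([1000 * (1 - share), 1000 * share]).astype(int)
        successes = rng.binomial(n, true_rates)
        alpha += successes
        beta += n - successes

    # Posterior P(arm is best), estimated by posterior sampling.
    samples = rng.beta(alpha, beta, size=(100_000, 2))
    p_best = np.bincount(samples.argmax(axis=1), minlength=2) / len(samples)
    print("posterior P(best) per arm:", p_best)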
This\npaper shows that Bayesian bandits without neutral posterior reshaping,\nparticularly naive Thompson sampling (NB-TS), are untrustworthy because they\ncan always identify an arm as best from equivalent best arms. To restore\ntrustworthiness, a novel finding uncovers the connection between the convergence\ndistribution of the posterior optimal probabilities of equivalent best arms and\nneutral posterior reshaping, which controls false positives. Lastly, this paper\npresents lessons learned from eBay's experience, as well as evaluations of the\nfour algorithms. We hope our work is useful to other industrial practitioners\nand inspires academic researchers interested in the trustworthiness of adaptive\ntraffic experimentation.\n","authors":["Zezhong Zhang","Ted Yuan"],"pdf_url":"https://arxiv.org/pdf/2305.14704v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14181v1","updated":"2023-08-27T19:01:29Z","published":"2023-08-27T19:01:29Z","title":"Topological Augmentation for Class-Imbalanced Node Classification","summary":" Class imbalance is prevalent in real-world node classification tasks and\noften biases graph learning models toward majority classes. Most existing\nstudies start from a node-centric perspective and aim to address the class\nimbalance in training data by node/class-wise reweighting or resampling. In\nthis paper, we approach the source of the class-imbalance bias from an\nunder-explored topology-centric perspective. Our investigation reveals that\nbeyond the inherently skewed training class distribution, the graph topology\nalso plays an important role in the formation of predictive bias: we identify\ntwo fundamental challenges, namely ambivalent and distant message-passing, that\ncan exacerbate the bias by aggravating majority-class over-generalization and\nminority-class misclassification. In light of these findings, we devise a\nlightweight topological augmentation method ToBA to dynamically rectify the\nnodes influenced by ambivalent/distant message-passing during graph learning,\nso as to mitigate the class-imbalance bias. We highlight that ToBA is a\nmodel-agnostic, efficient, and versatile solution that can be seamlessly\ncombined with and further boost other imbalance-handling techniques. Systematic\nexperiments validate the superior performance of ToBA in both promoting\nimbalanced node classification and mitigating the prediction bias between\ndifferent classes.\n","authors":["Zhining Liu","Zhichen Zeng","Ruizhong Qiu","Hyunsik Yoo","David Zhou","Zhe Xu","Yada Zhu","Kommy Weldemariam","Jingrui He","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2308.14181v1.pdf","comment":"19 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14175v1","updated":"2023-08-27T18:38:09Z","published":"2023-08-27T18:38:09Z","title":"Leveraging Linear Independence of Component Classifiers: Optimizing Size\n and Prediction Accuracy for Online Ensembles","summary":" Ensembles, which employ a set of classifiers to enhance classification\naccuracy collectively, are crucial in the era of big data. However, although\nit is generally agreed that ensemble size and prediction accuracy are related,\nthe exact nature of this relationship is still unknown. We\nintroduce a novel perspective, rooted in the linear independence of\nclassifiers' votes, to analyze the interplay between ensemble size and\nprediction accuracy. This framework reveals a theoretical link and consequently\nsuggests an ensemble size based on this relationship. 
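A toy simulation of the linear-independence quantity just mentioned, assuming (purely for illustration) that component classifiers emit random +/-1 vote vectors over a fixed set of points:

    # Sketch: estimate the probability that k classifiers' vote vectors are
    # linearly independent, as a function of ensemble size k. Random votes
    # are a simplifying assumption, not the paper's model of classifiers.
    import numpy as np

    rng = np.random.default_rng(0)
    n_points, trials = 20, 2000        # each classifier votes on 20 points

    for k in (5, 10, 15, 20):
        indep = sum(
            np.linalg.matrix_rank(
                rng.choice([-1.0, 1.0], size=(k, n_points))) == k
            for _ in range(trials)
        )
        print(f"k={k:2d}  P(linearly independent votes) ~ {indep / trials:.3f}")

Since at most n_points vote vectors can be independent, pushing k past the number of evaluation points cannot add independent directions, which is one intuition for the diminishing returns reported below.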
Our study builds upon a\ngeometric framework and develops a series of theorems. These theorems clarify\nthe role of linear dependency in crafting ensembles. We present a method to\ndetermine the minimum ensemble size required to ensure a target probability of\nlinearly independent votes among component classifiers. Using real and\nsynthetic datasets, our empirical results demonstrate a trend: increasing the\nnumber of classifiers enhances accuracy, as predicted by our theoretical\ninsights. However, we also identify a point of diminishing returns, beyond\nwhich additional classifiers provide diminishing improvements in accuracy.\nSurprisingly, the calculated ideal ensemble size deviates from empirical\nresults for certain datasets, emphasizing the influence of other factors. This\nstudy opens avenues for deeper investigations into the complex dynamics\ngoverning ensemble design and offers guidance for constructing efficient and\neffective ensembles in practical scenarios.\n","authors":["Enes Bektas","Fazli Can"],"pdf_url":"https://arxiv.org/pdf/2308.14175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14174v1","updated":"2023-08-27T18:35:46Z","published":"2023-08-27T18:35:46Z","title":"Integrated Approach of Gearbox Fault Diagnosis","summary":" Gearbox fault diagnosis is one of the most important tasks in any industrial\nsystem. Failure of components inside a gearbox can lead to catastrophic\nfailure, unplanned breakdown, and financial losses for an industrial organization. In\nthis context, intelligent maintenance of the gearbox becomes essential. This paper\npresents an integrated gearbox fault diagnosis approach which can be easily deployed\nin online condition monitoring. This work introduces a nonparametric data\npreprocessing technique, the calculus enhanced energy operator (CEEO), to\npreserve the characteristic frequencies in noisy and interference-affected vibrational\nsignals. A set of time domain and spectral domain features are calculated from\nthe raw and CEEO vibration signals and input to a multiclass support vector\nmachine (MCSVM) to diagnose faults in the system. A comparison\nbetween the raw and CEEO signals is presented to show the impact of CEEO on\ngearbox fault diagnosis. The obtained results look very promising, and the approach\ncan be implemented in any type of industrial system due to its\nnonparametric nature.\n","authors":["Vikash Kumar","Subrata Mukherjee","Somnath Sarangi"],"pdf_url":"https://arxiv.org/pdf/2308.14174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14172v1","updated":"2023-08-27T18:28:58Z","published":"2023-08-27T18:28:58Z","title":"Hypergraph Structure Inference From Data Under Smoothness Prior","summary":" Hypergraphs are important for processing data with higher-order relationships\ninvolving more than two entities. In scenarios where explicit hypergraphs are\nnot readily available, it is desirable to infer a meaningful hypergraph\nstructure from the node features to capture the intrinsic relations within the\ndata. However, existing methods either adopt simple pre-defined rules that fail\nto precisely capture the distribution of the potential hypergraph structure, or\nlearn a mapping between hypergraph structures and node features but require a\nlarge amount of labelled data, i.e., pre-existing hypergraph structures, for\ntraining. Both restrict their applications in practical scenarios. 
To fill this\ngap, we propose a novel smoothness prior that enables us to design a method to\ninfer the probability for each potential hyperedge without labelled data as\nsupervision. The proposed prior indicates that the features of nodes in a hyperedge are\nhighly correlated through the features of the hyperedge containing them. We use this\nprior to derive the relation between the hypergraph structure and the node\nfeatures via probabilistic modelling. This allows us to develop an unsupervised\ninference method to estimate the probability for each potential hyperedge via\nsolving an optimisation problem that has an analytical solution. Experiments on\nboth synthetic and real-world data demonstrate that our method can learn\nmeaningful hypergraph structures from data more efficiently than existing\nhypergraph structure inference methods.\n","authors":["Bohan Tang","Siheng Chen","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2308.14172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08244v2","updated":"2023-08-27T18:23:01Z","published":"2022-11-14T03:51:12Z","title":"Artificial Intelligence for Automatic Detection and Classification\n Disease on the X-Ray Images","summary":" Detecting and classifying diseases using X-ray images is one of the more\nchallenging core tasks in the medical and research world. Due to the recent\nhigh interest in radiological images and AI, early detection of diseases in\nX-ray images has become notably more essential to prevent further spreading and\nflatten the curve. Innovations in Computer Vision with Deep\nlearning methods offer great promise for fast and accurate diagnosis of\nscreening and detection from chest X-ray images (CXR). This work presents rapid\ndetection of diseases in the lung using the efficient Deep learning pre-trained\nRepVGG algorithm for deep feature extraction and classification. We used X-ray\nimages as an example to show the model's efficiency. To perform this task, we\nclassify X-Ray images into Covid-19, Pneumonia, and Normal X-Ray images. We employ\nROI objects to improve the detection accuracy of lung extraction, followed by\ndata pre-processing and augmentation. We apply Artificial Intelligence\ntechnology to automatically highlight and detect affected areas of the\nlungs. Based on the X-Ray images, an algorithm was developed that classifies\nX-Ray images with high accuracy and faster inference thanks to the architecture\ntransformation of the model. We compared the accuracy and disease-detection\nperformance of deep learning frameworks. The study shows the high power of deep learning methods\nfor COVID-19 detection utilizing chest X-rays. The\nproposed framework offers better diagnostic accuracy compared with popular deep\nlearning models, i.e., VGG, ResNet50, InceptionV3, DenseNet, and\nInceptionResnetV2.\n","authors":["Liora Mayats-Alpay"],"pdf_url":"https://arxiv.org/pdf/2211.08244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14165v1","updated":"2023-08-27T17:58:32Z","published":"2023-08-27T17:58:32Z","title":"Distributional Off-Policy Evaluation for Slate Recommendations","summary":" Recommendation strategies are typically evaluated by using previously logged\ndata, employing off-policy evaluation methods to estimate their expected\nperformance. However, for strategies that present users with slates of multiple\nitems, the resulting combinatorial action space renders many of these methods\nimpractical. 
Prior work has developed estimators that leverage the structure in\nslates to estimate the expected off-policy performance, but the estimation of\nthe entire performance distribution remains elusive. Estimating the complete\ndistribution allows for a more comprehensive evaluation of recommendation\nstrategies, particularly along the axes of risk and fairness that employ\nmetrics computable from the distribution. In this paper, we propose an\nestimator for the complete off-policy performance distribution for slates and\nestablish conditions under which the estimator is unbiased and consistent. This\nbuilds upon prior work on off-policy evaluation for slates and off-policy\ndistribution estimation in reinforcement learning. We validate the efficacy of\nour method empirically on synthetic data as well as on a slate recommendation\nsimulator constructed from real-world data (MovieLens-20M). Our results show a\nsignificant reduction in estimation variance and improved sample efficiency\nover prior work across a range of slate structures.\n","authors":["Shreyas Chaudhari","David Arbour","Georgios Theocharous","Nikos Vlassis"],"pdf_url":"https://arxiv.org/pdf/2308.14165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14163v1","updated":"2023-08-27T17:47:30Z","published":"2023-08-27T17:47:30Z","title":"Explaining with Attribute-based and Relational Near Misses: An\n Interpretable Approach to Distinguishing Facial Expressions of Pain and\n Disgust","summary":" Explaining concepts by contrasting examples is an efficient and convenient\nway of giving insights into the reasons behind a classification decision. This\nis of particular interest in decision-critical domains, such as medical\ndiagnostics. One particularly challenging use case is to distinguish facial\nexpressions of pain and other states, such as disgust, due to high similarity\nof manifestation. In this paper, we present an approach for generating\ncontrastive explanations to explain facial expressions of pain and disgust\nshown in video sequences. We implement and compare two approaches for\ncontrastive explanation generation. The first approach explains a specific pain\ninstance in contrast to the most similar disgust instance(s) based on the\noccurrence of facial expressions (attributes). The second approach takes into\naccount which temporal relations hold between intervals of facial expressions\nwithin a sequence (relations). The input to our explanation generation approach\nis the output of an interpretable rule-based classifier for pain and disgust. We\nutilize two different similarity metrics to determine near misses and far\nmisses as contrasting instances. Our results show that near miss explanations\nare shorter than far miss explanations, independent of the applied similarity\nmetric. The outcome of our evaluation indicates that pain and disgust can be\ndistinguished with the help of temporal relations. We currently plan\nexperiments to evaluate how the explanations help in teaching concepts and how\nthey could be enhanced by further modalities and interaction.\n","authors":["Bettina Finzel","Simon P. Kuhn","David E. 
Tafler","Ute Schmid"],"pdf_url":"https://arxiv.org/pdf/2308.14163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12856v3","updated":"2023-08-27T16:46:38Z","published":"2022-08-26T20:08:40Z","title":"Local Context-Aware Active Domain Adaptation","summary":" Active Domain Adaptation (ADA) queries the labels of a small number of\nselected target samples to help adapting a model from a source domain to a\ntarget domain. The local context of queried data is important, especially when\nthe domain gap is large. However, this has not been fully explored by existing\nADA works. In this paper, we propose a Local context-aware ADA framework, named\nLADA, to address this issue. To select informative target samples, we devise a\nnovel criterion based on the local inconsistency of model predictions. Since\nthe labeling budget is usually small, fine-tuning model on only queried data\ncan be inefficient. We progressively augment labeled target data with the\nconfident neighbors in a class-balanced manner. Experiments validate that the\nproposed criterion chooses more informative target samples than existing active\nselection strategies. Furthermore, our full method clearly surpasses recent ADA\narts on various benchmarks. Code is available at https://github.com/tsun/LADA.\n","authors":["Tao Sun","Cheng Lu","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2208.12856v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14144v1","updated":"2023-08-27T15:57:08Z","published":"2023-08-27T15:57:08Z","title":"Learning end-to-end inversion of circular Radon transforms in the\n partial radial setup","summary":" We present a deep learning-based computational algorithm for inversion of\ncircular Radon transforms in the partial radial setup, arising in photoacoustic\ntomography. We first demonstrate that the truncated singular value\ndecomposition-based method, which is the only traditional algorithm available\nto solve this problem, leads to severe artifacts which renders the\nreconstructed field as unusable. With the objective of overcoming this\ncomputational bottleneck, we train a ResBlock based U-Net to recover the\ninferred field that directly operates on the measured data. Numerical results\nwith augmented Shepp-Logan phantoms, in the presence of noisy full and limited\nview data, demonstrate the superiority of the proposed algorithm.\n","authors":["Deep Ray","Souvik Roy"],"pdf_url":"https://arxiv.org/pdf/2308.14144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14142v1","updated":"2023-08-27T15:44:28Z","published":"2023-08-27T15:44:28Z","title":"Integrated Variational Fourier Features for Fast Spatial Modelling with\n Gaussian Processes","summary":" Sparse variational approximations are popular methods for scaling up\ninference and learning in Gaussian processes to larger datasets. For $N$\ntraining points, exact inference has $O(N^3)$ cost; with $M \\ll N$ features,\nstate of the art sparse variational methods have $O(NM^2)$ cost. Recently,\nmethods have been proposed using more sophisticated features; these promise\n$O(M^3)$ cost, with good performance in low dimensional tasks such as spatial\nmodelling, but they only work with a very limited class of kernels, excluding\nsome of the most commonly used. In this work, we propose integrated Fourier\nfeatures, which extends these performance benefits to a very broad class of\nstationary covariance functions. 
We motivate the method and choice of\nparameters from a convergence analysis and empirical exploration, and show\npractical speedup in synthetic and real-world spatial regression tasks.\n","authors":["Talay M Cheema","Carl Edward Rasmussen"],"pdf_url":"https://arxiv.org/pdf/2308.14142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14132v1","updated":"2023-08-27T15:20:06Z","published":"2023-08-27T15:20:06Z","title":"Detecting Language Model Attacks with Perplexity","summary":" A novel hack involving Large Language Models (LLMs) has emerged, leveraging\nadversarial suffixes to trick models into generating perilous responses. This\nmethod has garnered considerable attention from reputable media outlets such as\nthe New York Times and Wired, thereby influencing public perception regarding\nthe security and safety of LLMs. In this study, we advocate using\nperplexity as one means of recognizing such potential attacks. The\nunderlying concept behind these hacks revolves around appending an unusually\nconstructed string of text to a harmful query that would otherwise be blocked.\nThis maneuver confuses the protective mechanisms and tricks the model into\ngenerating a forbidden response. Such scenarios could result in providing\ndetailed instructions to a malicious user for constructing explosives or\norchestrating a bank heist. Our investigation demonstrates the feasibility of\nemploying perplexity, a prevalent natural language processing metric, to detect\nthese adversarial tactics before generating a forbidden response. By evaluating\nthe perplexity of queries with and without such adversarial suffixes using an\nopen-source LLM, we discovered that nearly 90 percent were above a perplexity\nof 1000. This contrast underscores the efficacy of perplexity for detecting\nthis type of exploit.\n","authors":["Gabriel Alon","Michael Kamfonas"],"pdf_url":"https://arxiv.org/pdf/2308.14132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14129v1","updated":"2023-08-27T15:11:44Z","published":"2023-08-27T15:11:44Z","title":"SPEED: Streaming Partition and Parallel Acceleration for Temporal\n Interaction Graph Embedding","summary":" Temporal Interaction Graphs (TIGs) are widely employed to model intricate\nreal-world systems such as financial systems and social networks. To capture\nthe dynamism and interdependencies of nodes, existing TIG embedding models need\nto process edges sequentially and chronologically. However, this requirement\nprevents parallel processing and makes it difficult to accommodate\nburgeoning data volumes on the GPU. Consequently, many large-scale temporal\ninteraction graphs are confined to CPU processing. Furthermore, a generalized\nGPU scaling and acceleration approach remains unavailable. To facilitate\nlarge-scale TIGs' implementation on GPUs for acceleration, we introduce a novel\ntraining approach, namely Streaming Edge Partitioning and Parallel Acceleration\nfor Temporal Interaction Graph Embedding (SPEED). SPEED is comprised of a\nStreaming Edge Partitioning Component (SEP), which addresses the space overhead\nissue by assigning fewer nodes to each GPU, and a Parallel Acceleration\nComponent (PAC), which enables simultaneous training of different sub-graphs,\naddressing the time overhead issue. Our method can achieve a good balance in\ncomputing resources, computing time, and downstream task performance. Empirical\nvalidation across 7 real-world datasets demonstrates the potential to expedite\ntraining speeds by a factor of up to 19.29x. 
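Returning to the perplexity-based attack detection described above, a minimal sketch follows. The GPT-2 model choice is ours (the study only specifies an open-source LLM), the gibberish suffix is a made-up stand-in for a real adversarial suffix, and the 1000 threshold mirrors the figure reported in the abstract.

    # Sketch: flag suspicious prompts by perplexity. Perplexity is computed
    # as exp(mean token cross-entropy) of the prompt under a causal LM.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

    def perplexity(text: str) -> float:
        ids = tok(text, return_tensors="pt").input_ids
        with torch.no_grad():
            loss = model(ids, labels=ids).loss   # mean token cross-entropy
        return float(torch.exp(loss))

    def is_suspicious(prompt: str, threshold: float = 1000.0) -> bool:
        return perplexity(prompt) > threshold

    print(is_suspicious("How do I bake sourdough bread?"))          # natural
    print(is_suspicious("x7] *]( describing.+ similarlyNow write "  # gibberish
                        "oppositeley ]( Me giving**ONE"))           # suffix-like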
Simultaneously, resource\nconsumption of a single-GPU can be diminished by up to 69%, thus enabling the\nmultiple GPU-based training and acceleration encompassing millions of nodes and\nbillions of edges. Furthermore, our approach also maintains its competitiveness\nin downstream tasks.\n","authors":["Xi Chen","Yongxiang Liao","Yun Xiong","Yao Zhang","Siwei Zhang","Jiawei Zhang","Yiheng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14129v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2206.04678v3","updated":"2023-08-27T14:35:43Z","published":"2022-06-08T17:19:55Z","title":"ReCo: A Dataset for Residential Community Layout Planning","summary":" Layout planning is centrally important in the field of architecture and urban\ndesign. Among the various basic units carrying urban functions, residential\ncommunity plays a vital part for supporting human life. Therefore, the layout\nplanning of residential community has always been of concern, and has attracted\nparticular attention since the advent of deep learning that facilitates the\nautomated layout generation and spatial pattern recognition. However, the\nresearch circles generally suffer from the insufficiency of residential\ncommunity layout benchmark or high-quality datasets, which hampers the future\nexploration of data-driven methods for residential community layout planning.\nThe lack of datasets is largely due to the difficulties of large-scale\nreal-world residential data acquisition and long-term expert screening. In\norder to address the issues and advance a benchmark dataset for various\nintelligent spatial design and analysis applications in the development of\nsmart city, we introduce Residential Community Layout Planning (ReCo) Dataset,\nwhich is the first and largest open-source vector dataset related to real-world\ncommunity to date. ReCo Dataset is presented in multiple data formats with\n37,646 residential community layout plans, covering 598,728 residential\nbuildings with height information. ReCo can be conveniently adapted for\nresidential community layout related urban design tasks, e.g., generative\nlayout design, morphological pattern recognition and spatial evaluation. To\nvalidate the utility of ReCo in automated residential community layout\nplanning, two Generative Adversarial Network (GAN) based generative models are\nfurther applied to the dataset. We expect ReCo Dataset to inspire more creative\nand practical work in intelligent design and beyond. The ReCo Dataset is\npublished at: https://www.kaggle.com/fdudsde/reco-dataset.\n","authors":["Xi Chen","Yun Xiong","Siqi Wang","Haofen Wang","Tao Sheng","Yao Zhang","Yu Ye"],"pdf_url":"https://arxiv.org/pdf/2206.04678v3.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14120v1","updated":"2023-08-27T14:28:38Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap\nand perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT CI without specific guidance. 
ChatGPT CI autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. We conclude that chatGPT CI offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14119v1","updated":"2023-08-27T14:25:07Z","published":"2023-08-27T14:25:07Z","title":"Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario","summary":" Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to\nimprove model performance. Traditional SSL methods assume that labeled and\nunlabeled data share the same label space. However, in real-world applications,\nespecially when the labeled training set is small, there may be classes that\nare missing from the labeled set. Existing frameworks aim to either reject all\nunseen classes (open-set SSL) or to discover unseen classes by partitioning an\nunlabeled set during training (open-world SSL). In our work, we construct a\nclassifier for points from both seen and unseen classes. Our approach is based\non extending an existing SSL method, such as FlexMatch, by incorporating an\nadditional entropy loss. This enhancement allows our method to improve the\nperformance of any existing SSL method in the classification of both seen and\nunseen classes. We demonstrate large improvement gains over state-of-the-art\nSSL, open-set SSL, and open-world SSL methods, on two benchmark image\nclassification data sets, CIFAR-100 and STL-10. The gains are most pronounced\nwhen the labeled data is severely limited (1-25 labeled examples per class).\n","authors":["Noam Fluss","Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14114v1","updated":"2023-08-27T14:13:29Z","published":"2023-08-27T14:13:29Z","title":"Hybrid Transformer-RNN Architecture for Household Occupancy Detection\n Using Low-Resolution Smart Meter Data","summary":" Residential occupancy detection has become an enabling technology in today's\nurbanized world for various smart home applications, such as building\nautomation, energy management, and improved security and comfort.\nDigitalization of the energy system provides smart meter data that can be used\nfor occupancy detection in a non-intrusive manner without causing concerns\nregarding privacy and data security. In particular, deep learning techniques\nmake it possible to infer occupancy from low-resolution smart meter data, such\nthat the need for accurate occupancy detection with privacy preservation can be\nachieved. Our work is thus motivated to develop a privacy-aware and effective\nmodel for residential occupancy detection in contemporary living environments.\nOur model aims to leverage the advantages of both recurrent neural networks\n(RNNs), which are adept at capturing local temporal dependencies, and\ntransformers, which are effective at handling global temporal dependencies. 
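A minimal PyTorch sketch in the spirit of this hybrid design follows, with a GRU capturing local dependencies and feeding a Transformer encoder for global ones. Layer sizes, their ordering, and the single consumption feature are assumptions for illustration, not the authors' exact architecture.

    # Sketch: hybrid RNN + Transformer occupancy classifier over hourly
    # smart meter readings (binary occupied/unoccupied output).
    import torch
    import torch.nn as nn

    class HybridOccupancy(nn.Module):
        def __init__(self, d_model=64, nhead=4, num_layers=2):
            super().__init__()
            self.rnn = nn.GRU(input_size=1, hidden_size=d_model,
                              batch_first=True)
            enc = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                             batch_first=True)
            self.transformer = nn.TransformerEncoder(enc, num_layers=num_layers)
            self.head = nn.Linear(d_model, 1)

        def forward(self, x):          # x: (batch, hours, 1) meter readings
            h, _ = self.rnn(x)         # local temporal dependencies
            h = self.transformer(h)    # global temporal dependencies
            return self.head(h.mean(dim=1)).squeeze(-1)   # occupancy logit

    model = HybridOccupancy()
    meter = torch.randn(8, 24, 1)      # one day of hourly data, batch of 8
    print(torch.sigmoid(model(meter)).shape)   # torch.Size([8])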
Our\ndesigned hybrid transformer-RNN model detects residential occupancy using\nhourly smart meter data, achieving an accuracy of nearly 92\\% across households\nwith diverse profiles. We validate the effectiveness of our method using a\npublicly accessible dataset and demonstrate its performance by comparing it\nwith state-of-the-art models, including attention-based occupancy detection\nmethods.\n","authors":["Xinyu Liang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14114v1.pdf","comment":"IEEE IECON 2023 (The 49th Annual Conference of the IEEE Industrial\n Electronics Society)"},{"id":"http://arxiv.org/abs/2308.14108v1","updated":"2023-08-27T13:50:15Z","published":"2023-08-27T13:50:15Z","title":"Depth self-supervision for single image novel view synthesis","summary":" In this paper, we tackle the problem of generating a novel image from an\narbitrary viewpoint given a single frame as input. While existing methods\noperating in this setup aim at predicting the target view depth map to guide\nthe synthesis, without explicit supervision over such a task, we jointly\noptimize our framework for both novel view synthesis and depth estimation to\nunleash the synergy between the two at its best. Specifically, a shared depth\ndecoder is trained in a self-supervised manner to predict depth maps that are\nconsistent across the source and target views. Our results demonstrate the\neffectiveness of our approach in addressing the challenges of both tasks\nallowing for higher-quality generated images, as well as more accurate depth\nfor the target viewpoint.\n","authors":["Giovanni Minelli","Matteo Poggi","Samuele Salti"],"pdf_url":"https://arxiv.org/pdf/2308.14108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14104v1","updated":"2023-08-27T13:22:50Z","published":"2023-08-27T13:22:50Z","title":"Towards Generalizable Neural Solvers for Vehicle Routing Problems via\n Ensemble with Transferrable Local Policy","summary":" Machine learning has been adapted to help solve NP-hard combinatorial\noptimization problems. One prevalent way is learning to construct solutions by\ndeep neural networks, which has been receiving more and more attention due to\nthe high efficiency and less requirement for expert knowledge. However, many\nneural construction methods for Vehicle Routing Problems (VRPs) focus on\nsynthetic problem instances with limited scales and specified node\ndistributions, leading to poor performance on real-world problems which usually\ninvolve large scales together with complex and unknown node distributions. To\nmake neural VRP solvers more practical in real-world scenarios, we design an\nauxiliary policy that learns from the local transferable topological features,\nnamed local policy, and integrate it with a typical constructive policy (which\nlearns from the global information of VRP instances) to form an ensemble\npolicy. With joint training, the aggregated policies perform cooperatively and\ncomplementarily to boost generalization. 
Experimental results on TSPLIB and CVRPLIB, two\nwell-known benchmarks for the travelling salesman problem and the\ncapacitated VRP, show that the ensemble policy consistently achieves better\ngeneralization than state-of-the-art construction methods and even works well\non real-world problems with several thousand nodes.\n","authors":["Chengrui Gao","Haopu Shang","Ke Xue","Dong Li","Chao Qian"],"pdf_url":"https://arxiv.org/pdf/2308.14104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11391v2","updated":"2023-08-27T13:12:30Z","published":"2023-05-19T02:41:12Z","title":"A Survey of Safety and Trustworthiness of Large Language Models through\n the Lens of Verification and Validation","summary":" Large Language Models (LLMs) have set off a new wave of AI enthusiasm for their\nability to engage end-users in human-level conversations with detailed and\narticulate answers across many knowledge domains. In response to their fast\nadoption in many industrial applications, this survey concerns their safety and\ntrustworthiness. First, we review known vulnerabilities and limitations of the\nLLMs, categorising them into inherent issues, attacks, and unintended bugs.\nThen, we consider if and how the Verification and Validation (V&V) techniques,\nwhich have been widely developed for traditional software and deep learning\nmodels such as convolutional neural networks as independent processes to check\nthe alignment of their implementations against the specifications, can be\nintegrated and further extended throughout the lifecycle of the LLMs to provide\nrigorous analysis of the safety and trustworthiness of LLMs and their\napplications. Specifically, we consider four complementary techniques:\nfalsification and evaluation, verification, runtime monitoring, and regulations\nand ethical use. In total, 370+ references are considered to support the quick\nunderstanding of the safety and trustworthiness issues from the perspective of\nV&V. While intensive research has been conducted to identify the safety and\ntrustworthiness issues, rigorous yet practical methods are called for to ensure\nthe alignment of LLMs with safety and trustworthiness requirements.\n","authors":["Xiaowei Huang","Wenjie Ruan","Wei Huang","Gaojie Jin","Yi Dong","Changshun Wu","Saddek Bensalem","Ronghui Mu","Yi Qi","Xingyu Zhao","Kaiwen Cai","Yanghao Zhang","Sihao Wu","Peipei Xu","Dengyu Wu","Andre Freitas","Mustafa A. Mustafa"],"pdf_url":"https://arxiv.org/pdf/2305.11391v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14093v1","updated":"2023-08-27T12:35:38Z","published":"2023-08-27T12:35:38Z","title":"The inverse problem for neural networks","summary":" We study the problem of computing the preimage of a set under a neural\nnetwork with piecewise-affine activation functions. We recall an old result\nthat the preimage of a polyhedral set is again a union of polyhedral sets and\ncan be effectively computed.
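The ensemble policy described in the VRP abstract above can be pictured as mixing the per-step next-node distributions of a global constructive policy and a local policy. A schematic sketch; the mixing rule and weight are illustrative assumptions, not the paper's aggregation:

```python
# Sketch: combine two policies' next-node logits per construction step,
# then sample the next node to visit from the mixed distribution.
import torch

def ensemble_step(global_logits, local_logits, alpha=0.5):
    """Mix global- and local-policy logits and sample one node index."""
    probs = torch.softmax(alpha * global_logits + (1 - alpha) * local_logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)

next_node = ensemble_step(torch.randn(1, 100), torch.randn(1, 100))  # 100 candidate nodes
```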
We show several applications of computing the\npreimage for analysis and interpretability of neural networks.\n","authors":["Marcelo Forets","Christian Schilling"],"pdf_url":"https://arxiv.org/pdf/2308.14093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14089v1","updated":"2023-08-27T12:24:39Z","published":"2023-08-27T12:24:39Z","title":"MedAlign: A Clinician-Generated Dataset for Instruction Following with\n Electronic Medical Records","summary":" The ability of large language models (LLMs) to follow natural language\ninstructions with human-level fluency suggests many opportunities in healthcare\nto reduce administrative burden and improve quality of care. However,\nevaluating LLMs on realistic text generation tasks for healthcare remains\nchallenging. Existing question answering datasets for electronic health record\n(EHR) data fail to capture the complexity of information needs and\ndocumentation burdens experienced by clinicians. To address these challenges,\nwe introduce MedAlign, a benchmark dataset of 983 natural language instructions\nfor EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes\nclinician-written reference responses for 303 instructions, and provides 276\nlongitudinal EHRs for grounding instruction-response pairs. We used MedAlign to\nevaluate 6 general domain LLMs, having clinicians rank the accuracy and quality\nof each LLM response. We found high error rates, ranging from 35% (GPT-4) to\n68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k\ncontext lengths for GPT-4. Finally, we report correlations between clinician\nrankings and automated natural language generation metrics as a way to rank\nLLMs without human review. We make MedAlign available under a research data use\nagreement to enable LLM evaluations on tasks aligned with clinician needs and\npreferences.\n","authors":["Scott L. Fleming","Alejandro Lozano","William J. Haberkorn","Jenelle A. Jindal","Eduardo P. Reis","Rahul Thapa","Louis Blankemeier","Julian Z. Genkins","Ethan Steinberg","Ashwin Nayak","Birju S. Patel","Chia-Chun Chiang","Alison Callahan","Zepeng Huo","Sergios Gatidis","Scott J. Adams","Oluseyi Fayanju","Shreya J. Shah","Thomas Savage","Ethan Goh","Akshay S. Chaudhari","Nima Aghaeepour","Christopher Sharp","Michael A. Pfeffer","Percy Liang","Jonathan H. Chen","Keith E. Morse","Emma P. Brunskill","Jason A. Fries","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14085v1","updated":"2023-08-27T12:16:33Z","published":"2023-08-27T12:16:33Z","title":"Sampling with flows, diffusion and autoregressive neural networks: A\n spin-glass perspective","summary":" Recent years witnessed the development of powerful generative models based on\nflows, diffusion or autoregressive neural networks, achieving remarkable\nsuccess in generating data from examples with applications in a broad range of\nareas. A theoretical analysis of the performance and understanding of the\nlimitations of these methods remain, however, challenging. In this paper, we\nundertake a step in this direction by analysing the efficiency of sampling by\nthese methods on a class of problems with a known probability distribution and\ncomparing it with the sampling performance of more traditional methods such as\nthe Monte Carlo Markov chain and Langevin dynamics. 
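The last step of the MedAlign abstract above, correlating clinician rankings with automatic metrics so LLMs can be ranked without human review, is the kind of analysis sketched below; the scores are made-up placeholders and the choice of Kendall's tau is ours:

```python
# Sketch: rank correlation between clinician rankings of LLM outputs and an
# automatic metric. All numbers below are illustrative placeholders.
from scipy.stats import kendalltau

clinician_rank = [1, 2, 3, 4, 5, 6]                 # 6 LLMs, 1 = best per clinicians
metric_score   = [0.71, 0.64, 0.66, 0.52, 0.48, 0.31]  # higher = better per the metric
tau, p = kendalltau(clinician_rank, [-s for s in metric_score])  # negate: rank 1 = high score
print(f"Kendall tau={tau:.2f} (p={p:.3f})")
```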
We focus on a class of\nprobability distributions widely studied in the statistical physics of\ndisordered systems that relate to spin glasses, statistical inference and\nconstraint satisfaction problems.\n We leverage the fact that sampling via flow-based, diffusion-based or\nautoregressive network methods can be equivalently mapped to the analysis of\nBayes-optimal denoising of a modified probability measure. Our findings\ndemonstrate that these methods encounter difficulties in sampling stemming from\nthe presence of a first-order phase transition along the algorithm's denoising\npath. Our conclusions go both ways: we identify regions of parameters where\nthese methods are unable to sample efficiently, while that is possible using\nstandard Monte Carlo or Langevin approaches. We also identify regions where the\nopposite happens: standard approaches are inefficient while the discussed\ngenerative methods work well.\n","authors":["Davide Ghio","Yatin Dandi","Florent Krzakala","Lenka Zdeborová"],"pdf_url":"https://arxiv.org/pdf/2308.14085v1.pdf","comment":"39 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.03953v2","updated":"2023-08-27T11:27:13Z","published":"2023-08-07T23:44:35Z","title":"PMU measurements based short-term voltage stability assessment of power\n systems via deep transfer learning","summary":" Deep learning has emerged as an effective solution for addressing the\nchallenges of short-term voltage stability assessment (STVSA) in power systems.\nHowever, existing deep learning-based STVSA approaches face limitations in\nadapting to topological changes, sample labeling, and handling small datasets.\nTo overcome these challenges, this paper proposes a novel phasor measurement\nunit (PMU) measurements-based STVSA method using deep transfer learning. The\nmethod leverages the real-time dynamic information captured by PMUs to create\nan initial dataset. It employs temporal ensembling for sample labeling and\nutilizes least squares generative adversarial networks (LSGAN) for data\naugmentation, enabling effective deep learning on small-scale datasets.\nAdditionally, the method enhances adaptability to topological changes by\nexploring connections between different faults. Experimental results on the\nIEEE 39-bus test system demonstrate that the proposed method improves model\nevaluation accuracy by approximately 20% through transfer learning, exhibiting\nstrong adaptability to topological changes. Leveraging the self-attention\nmechanism of the Transformer model, this approach offers significant advantages\nover shallow learning methods and other deep learning-based approaches.\n","authors":["Yang Li","Shitu Zhang","Yuanzheng Li","Jiting Cao","Shuyue Jia"],"pdf_url":"https://arxiv.org/pdf/2308.03953v2.pdf","comment":"Accepted by IEEE Transactions on Instrumentation & Measurement"},{"id":"http://arxiv.org/abs/2308.14058v1","updated":"2023-08-27T09:45:41Z","published":"2023-08-27T09:45:41Z","title":"Pruning the Unlabeled Data to Improve Semi-Supervised Learning","summary":" In the domain of semi-supervised learning (SSL), the conventional approach\ninvolves training a learner with a limited amount of labeled data alongside a\nsubstantial volume of unlabeled data, both drawn from the same underlying\ndistribution. However, for deep learning models, this standard practice may not\nyield optimal results.
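The LSGAN augmentation named in the PMU/STVSA abstract above uses the standard least-squares GAN objectives, sketched here in PyTorch; the discriminator outputs are placeholders:

```python
# Sketch of the standard LSGAN objectives: the discriminator regresses real
# samples toward 1 and fakes toward 0; the generator pushes fakes toward 1.
import torch

def lsgan_d_loss(d_real: torch.Tensor, d_fake: torch.Tensor) -> torch.Tensor:
    return 0.5 * ((d_real - 1).pow(2).mean() + d_fake.pow(2).mean())

def lsgan_g_loss(d_fake: torch.Tensor) -> torch.Tensor:
    return 0.5 * (d_fake - 1).pow(2).mean()

# Placeholder discriminator scores for a batch of 16 samples:
d_loss = lsgan_d_loss(torch.randn(16, 1), torch.randn(16, 1))
g_loss = lsgan_g_loss(torch.randn(16, 1))
```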
In this research, we propose an alternative perspective,\nsuggesting that distributions that are more readily separable could offer\ngreater benefits to the learner than the original distribution. To\nachieve this, we present PruneSSL, a practical technique for selectively\nremoving examples from the original unlabeled dataset to enhance its\nseparability. We present an empirical study, showing that although PruneSSL\nreduces the quantity of available training data for the learner, it\nsignificantly improves the performance of various competitive SSL algorithms,\nthereby achieving state-of-the-art results across several image classification\ntasks.\n","authors":["Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14058v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.15984v2","updated":"2023-08-27T12:45:33Z","published":"2023-07-29T13:12:40Z","title":"VATP360: Viewport Adaptive 360-Degree Video Streaming based on Tile\n Priority","summary":" 360-degree video is becoming increasingly popular among users. Under current\nnetwork bandwidth constraints, serving high-resolution 360-degree video to users is quite\ndifficult. Most of the work has been devoted to the prediction of user\nviewports or tile-based adaptive algorithms. However, it is difficult to\npredict user viewports more accurately using only information such as the user's\nhistorical viewports or video saliency maps. In this paper, we propose a\nviewport adaptive 360-degree video streaming method based on tile priority\n(VATP360), which tries to balance performance and overhead. The\nproposed VATP360 consists of three main modules: viewport prediction, tile\npriority classification and bitrate allocation. In the viewport prediction\nmodule, object motion trajectory and predicted user's region-of-interest (ROI)\nare used to achieve accurate prediction of the user's future viewport. Then,\nthe predicted viewport, along with the object motion trajectory, is fed into\nthe proposed tile priority classification algorithm to assign different\npriorities to tiles, which would reduce the computational complexity of the\nbitrate allocation module. Finally, in the bitrate allocation stage, we\nadaptively assign bitrates to tiles of different priority by reinforcement\nlearning. Experimental results on publicly available datasets have demonstrated\nthe effectiveness of the proposed method.\n","authors":["Zhiyu Pang"],"pdf_url":"https://arxiv.org/pdf/2307.15984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13998v1","updated":"2023-08-27T03:55:28Z","published":"2023-08-27T03:55:28Z","title":"Computation-efficient Deep Learning for Computer Vision: A Survey","summary":" Over the past decade, deep learning models have exhibited considerable\nadvancements, reaching or even exceeding human-level performance in a range of\nvisual perception tasks. This remarkable progress has sparked interest in\napplying deep networks to real-world applications, such as autonomous vehicles,\nmobile devices, robotics, and edge computing. However, the challenge remains\nthat state-of-the-art models usually demand significant computational\nresources, leading to impractical power consumption, latency, or carbon\nemissions in real-world scenarios. This trade-off between effectiveness and\nefficiency has catalyzed the emergence of a new research focus: computationally\nefficient deep learning, which strives to achieve satisfactory performance\nwhile minimizing the computational cost during inference.
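One plausible reading of the pruning idea in the PruneSSL abstract above, selectively removing unlabeled examples to enhance separability, is a margin-based filter; the rule below is an illustrative stand-in, not the authors' algorithm:

```python
# Sketch: prune unlabeled points whose top-two class probabilities under a
# preliminary classifier are too close (low margin), keeping the rest.
import numpy as np

def prune_low_margin(probs: np.ndarray, threshold: float = 0.2) -> np.ndarray:
    """probs: (n_samples, n_classes) -> boolean keep-mask."""
    top2 = np.sort(probs, axis=1)[:, -2:]          # two largest probabilities
    margin = top2[:, 1] - top2[:, 0]
    return margin >= threshold

keep = prune_low_margin(np.random.dirichlet(np.ones(10), size=1000))
```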
This review offers an\nextensive analysis of this rapidly evolving field by examining four key areas:\n1) the development of static or dynamic light-weighted backbone models for the\nefficient extraction of discriminative deep representations; 2) the specialized\nnetwork architectures or algorithms tailored for specific computer vision\ntasks; 3) the techniques employed for compressing deep learning models; and 4)\nthe strategies for deploying efficient deep networks on hardware platforms.\nAdditionally, we provide a systematic discussion on the critical challenges\nfaced in this domain, such as network architecture design, training schemes,\npractical efficiency, and more realistic model compression approaches, as well\nas potential future research directions.\n","authors":["Yulin Wang","Yizeng Han","Chaofei Wang","Shiji Song","Qi Tian","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.13998v1.pdf","comment":null}]},"2023-08-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.13961v1","updated":"2023-08-26T21:38:31Z","published":"2023-08-26T21:38:31Z","title":"Translate Meanings, Not Just Words: IdiomKB's Role in Optimizing\n Idiomatic Translation with Language Models","summary":" To translate well, machine translation (MT) systems and general-purpose\nlanguage models (LMs) need a deep understanding of both source and target\nlanguages and cultures. Therefore, idioms, with their non-compositional nature,\npose particular challenges for Transformer-based systems, as literal\ntranslations often miss the intended meaning. Traditional methods, which\nreplace idioms using existing knowledge bases (KBs), often lack scale and\ncontext awareness. Addressing these challenges, our approach prioritizes\ncontext awareness and scalability, allowing for offline storage of idioms in a\nmanageable KB size. This ensures efficient serving with smaller models and\nprovides a more comprehensive understanding of idiomatic expressions. To\naddress this, we introduce a multilingual idiom KB (IdiomKB) developed using\nlarge LMs. This KB facilitates better translation by smaller models, such as\nBLOOMZ (7.1B), Alpaca (7B), and InstructGPT (6.7B), by retrieving idioms'\nfigurative meanings. We present a novel, GPT-4-powered metric for human-aligned\nevaluation, demonstrating that IdiomKB considerably boosts model performance.\nHuman evaluations further validate our KB's quality.\n","authors":["Shuang Li","Jiangjie Chen","Siyu Yuan","Xinyi Wu","Hao Yang","Shimin Tao","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.13961v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.13958v1","updated":"2023-08-26T20:59:21Z","published":"2023-08-26T20:59:21Z","title":"Improving Knowledge Distillation for BERT Models: Loss Functions,\n Mapping Methods, and Weight Tuning","summary":" The use of large transformer-based models such as BERT, GPT, and T5 has led\nto significant advancements in natural language processing. However, these\nmodels are computationally expensive, necessitating model compression\ntechniques that reduce their size and complexity while maintaining accuracy.\nThis project investigates and applies knowledge distillation for BERT model\ncompression, specifically focusing on the TinyBERT student model.
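The IdiomKB pipeline above, retrieving an idiom's figurative meaning and handing it to a smaller translation model, can be sketched as follows; the KB contents and prompt wording are toy assumptions:

```python
# Sketch: look up idioms appearing in the input, inject their figurative
# meanings as hints, and build a prompt for a smaller translation model.
idiom_kb = {"break the ice": "to relieve initial social awkwardness"}  # toy KB

def build_prompt(sentence: str) -> str:
    hints = [f'"{i}" means: {m}' for i, m in idiom_kb.items() if i in sentence]
    hint_block = ("Idiom hints:\n" + "\n".join(hints) + "\n") if hints else ""
    return f"{hint_block}Translate to Chinese: {sentence}"

print(build_prompt("She told a joke to break the ice."))
```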
We explore\nvarious techniques to improve knowledge distillation, including experimentation\nwith loss functions, transformer layer mapping methods, and tuning the weights\nof attention and representation loss, and we evaluate our proposed techniques on a\nselection of downstream tasks from the GLUE benchmark. The goal of this work is\nto improve the efficiency and effectiveness of knowledge distillation, enabling\nthe development of more efficient and accurate models for a range of natural\nlanguage processing tasks.\n","authors":["Apoorv Dankar","Adeem Jassani","Kartikaeya Kumar"],"pdf_url":"https://arxiv.org/pdf/2308.13958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12057v2","updated":"2023-08-26T19:29:03Z","published":"2023-07-05T17:05:32Z","title":"External Reasoning: Towards Multi-Large-Language-Models Interchangeable\n Assistance with Human Feedback","summary":" Memory is identified as a crucial human faculty that allows for the retention\nof visual and linguistic information within the hippocampus and neurons in the\nbrain, which can subsequently be retrieved to address real-world challenges\nthat arise through a lifetime of learning. The resolution of complex AI tasks\nthrough the application of acquired knowledge represents a stride toward the\nrealization of artificial general intelligence. However, despite the prevalence\nof Large Language Models (LLMs) like GPT-3.5 and GPT-4 \\cite{brown2020language,\nleiter2023chatgpt, zaitsu2023distinguishing, OpenAI2023GPT4TR}, which have\ndisplayed remarkable capabilities in language comprehension, generation,\ninteraction, and reasoning, they are inhibited by constraints on context length\nthat preclude the processing of extensive, continually evolving knowledge\nbases. This paper proposes that LLMs could be augmented through the selective\nintegration of knowledge from external repositories, and in doing so,\nintroduces a novel methodology for External Reasoning, exemplified by ChatPDF.\nCentral to this approach is the establishment of a tiered policy for\n\\textbf{External Reasoning based on Multiple LLM Interchange Assistance} in\n\\cref{fig:overall}, where the level of support rendered is modulated across\nentry, intermediate, and advanced tiers based on the complexity of the query,\nwith adjustments made in response to human feedback. A comprehensive evaluation\nof this methodology is conducted using multiple LLMs and the results indicate\nstate-of-the-art performance in \\cref{comparison}, surpassing existing\nsolutions including ChatPDF.com. Moreover, the paper emphasizes that this\napproach is more efficient compared to the direct processing of full text by\nLLMs. The source code is publicly available at:\n\\url{https://github.com/AkideLiu/ANLP}.\n","authors":["Akide Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12057v2.pdf","comment":"technical report, add code link. arXiv admin note: text overlap with\n arXiv:2305.11206 by other authors"},{"id":"http://arxiv.org/abs/2307.02758v2","updated":"2023-08-26T16:56:27Z","published":"2023-07-06T03:43:45Z","title":"Exploring Linguistic Style Matching in Online Communities: The Role of\n Social Context and Conversation Dynamics","summary":" Linguistic style matching (LSM) in conversations can be reflective of several\naspects of social influence such as power or persuasion. However, how LSM\nrelates to the outcomes of online communication on platforms such as Reddit remains\nan open question.
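A condensed sketch of the distillation objective discussed in the knowledge distillation abstract above, combining attention-map and hidden-state MSE terms with a soft logit loss; the weights and temperature are assumptions to be tuned, not the authors' values:

```python
# Sketch: a weighted TinyBERT-style distillation loss over layer-mapped
# attention maps, (projected) hidden states, and temperature-softened logits.
import torch
import torch.nn.functional as F

def kd_loss(s_attn, t_attn, s_hid, t_hid, s_logits, t_logits,
            w_attn=1.0, w_hid=1.0, w_soft=1.0, T=2.0):
    attn = F.mse_loss(s_attn, t_attn)                  # attention-map matching
    hid = F.mse_loss(s_hid, t_hid)                     # hidden-state matching
    soft = F.kl_div(F.log_softmax(s_logits / T, -1),   # soft cross-entropy
                    F.softmax(t_logits / T, -1),
                    reduction="batchmean") * T * T
    return w_attn * attn + w_hid * hid + w_soft * soft
```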
In this study, we analyze a large corpus of two-party\nconversation threads in Reddit where we identify all occurrences of LSM using\ntwo types of style: the use of function words and formality. Using this\nframework, we examine how levels of LSM differ in conversations depending on\nseveral social factors within Reddit: post and subreddit features, conversation\ndepth, user tenure, and the controversiality of a comment. Finally, we measure\nthe change of LSM following loss of status after community banning. Our\nfindings reveal the interplay of LSM in Reddit conversations with several\ncommunity metrics, suggesting the importance of understanding conversation\nengagement when understanding community dynamics.\n","authors":["Aparna Ananthasubramaniam","Hong Chen","Jason Yan","Kenan Alkiek","Jiaxin Pei","Agrima Seth","Lavinia Dunagan","Minje Choi","Benjamin Litterer","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2307.02758v2.pdf","comment":"Equal contributions from authors 1-9 (AA, HC, JY, KA, JP, AS, LD, MC,\n BL)"},{"id":"http://arxiv.org/abs/2308.13916v1","updated":"2023-08-26T16:51:17Z","published":"2023-08-26T16:51:17Z","title":"Exploring Large Language Models for Knowledge Graph Completion","summary":" Knowledge graphs play a vital role in numerous artificial intelligence tasks,\nyet they frequently face the issue of incompleteness. In this study, we explore\nutilizing Large Language Models (LLM) for knowledge graph completion. We\nconsider triples in knowledge graphs as text sequences and introduce an\ninnovative framework called Knowledge Graph LLM (KG-LLM) to model these\ntriples. Our technique employs entity and relation descriptions of a triple as\nprompts and utilizes the response for predictions. Experiments on various\nbenchmark knowledge graphs demonstrate that our method attains state-of-the-art\nperformance in tasks such as triple classification and relation prediction. We\nalso find that fine-tuning relatively smaller models (e.g., LLaMA-7B,\nChatGLM-6B) outperforms recent ChatGPT and GPT-4.\n","authors":["Liang Yao","Jiazhen Peng","Chengsheng Mao","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2308.13916v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.13911v1","updated":"2023-08-26T16:10:30Z","published":"2023-08-26T16:10:30Z","title":"A Wide Evaluation of ChatGPT on Affective Computing Tasks","summary":" With the rise of foundation models, a new artificial intelligence paradigm\nhas emerged, by simply using general purpose foundation models with prompting\nto solve problems instead of training a separate machine learning model for\neach problem. Such models have been shown to have emergent properties of\nsolving problems that they were not initially trained on. The studies for the\neffectiveness of such models are still quite limited. In this work, we widely\nstudy the capabilities of the ChatGPT models, namely GPT-4 and GPT-3.5, on 13\naffective computing problems, namely aspect extraction, aspect polarity\nclassification, opinion extraction, sentiment analysis, sentiment intensity\nranking, emotions intensity ranking, suicide tendency detection, toxicity\ndetection, well-being assessment, engagement measurement, personality\nassessment, sarcasm detection, and subjectivity detection. We introduce a\nframework to evaluate the ChatGPT models on regression-based problems, such as\nintensity ranking problems, by modelling them as pairwise ranking\nclassification. 
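The pairwise-ranking reformulation described at the end of the affective-computing abstract above can be sketched as follows; `ask_model` is a hypothetical stand-in for a chat-model API call, not a real client:

```python
# Sketch: recast an intensity-ranking (regression-style) task as pairwise
# ranking classification and score the model by pairwise accuracy.
from itertools import combinations

def pairwise_accuracy(texts, gold_intensity, ask_model):
    correct = total = 0
    for i, j in combinations(range(len(texts)), 2):
        reply = ask_model(f"Which text is more intense?\nA: {texts[i]}\nB: {texts[j]}")
        gold_first = gold_intensity[i] > gold_intensity[j]   # is A truly more intense?
        correct += (reply == "A") == gold_first
        total += 1
    return correct / total
```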
We compare ChatGPT against more traditional NLP methods, such\nas end-to-end recurrent neural networks and transformers. The results\ndemonstrate the emergent abilities of the ChatGPT models on a wide range of\naffective computing problems, where GPT-3.5 and especially GPT-4 have shown\nstrong performance on many problems, particularly the ones related to\nsentiment, emotions, or toxicity. The ChatGPT models fell short for problems\nwith implicit signals, such as engagement measurement and subjectivity\ndetection.\n","authors":["Mostafa M. Amin","Rui Mao","Erik Cambria","Björn W. Schuller"],"pdf_url":"https://arxiv.org/pdf/2308.13911v1.pdf","comment":"8 pages with references, 2 tables"},{"id":"http://arxiv.org/abs/2308.13904v1","updated":"2023-08-26T15:21:47Z","published":"2023-08-26T15:21:47Z","title":"LMSanitator: Defending Prompt-Tuning Against Task-Agnostic Backdoors","summary":" Prompt-tuning has emerged as an attractive paradigm for deploying large-scale\nlanguage models due to its strong downstream task performance and efficient\nmultitask serving ability. Despite its wide adoption, we empirically show that\nprompt-tuning is vulnerable to downstream task-agnostic backdoors, which reside\nin the pretrained models and can affect arbitrary downstream tasks. The\nstate-of-the-art backdoor detection approaches cannot defend against\ntask-agnostic backdoors since they hardly converge in reversing the backdoor\ntriggers. To address this issue, we propose LMSanitator, a novel approach for\ndetecting and removing task-agnostic backdoors on Transformer models. Instead\nof directly inverting the triggers, LMSanitator aims to invert the predefined\nattack vectors (pretrained models' output when the input is embedded with\ntriggers) of the task-agnostic backdoors, which achieves much better\nconvergence performance and backdoor detection accuracy. LMSanitator further\nleverages prompt-tuning's property of freezing the pretrained model to perform\naccurate and fast output monitoring and input purging during the inference\nphase. Extensive experiments on multiple language models and NLP tasks\nillustrate the effectiveness of LMSanitator. For instance, LMSanitator achieves\n92.8% backdoor detection accuracy on 960 models and decreases the attack\nsuccess rate to less than 1% in most scenarios.\n","authors":["Chengkun Wei","Wenlong Meng","Zhikun Zhang","Min Chen","Minghu Zhao","Wenjing Fang","Lei Wang","Zihui Zhang","Wenzhi Chen"],"pdf_url":"https://arxiv.org/pdf/2308.13904v1.pdf","comment":"To Appear in the Network and Distributed System Security (NDSS)\n Symposium 2024, 26 February - 1 March 2024, San Diego, CA, USA"},{"id":"http://arxiv.org/abs/2307.02054v3","updated":"2023-08-26T11:02:16Z","published":"2023-07-05T06:38:52Z","title":"Emoji Prediction in Tweets using BERT","summary":" In recent years, the use of emojis in social media has increased\ndramatically, making them an important element in understanding online\ncommunication. However, predicting the meaning of emojis in a given text is a\nchallenging task due to their ambiguous nature. In this study, we propose a\ntransformer-based approach for emoji prediction using BERT, a widely-used\npre-trained language model. We fine-tuned BERT on a large corpus of text\n(tweets) containing both text and emojis to predict the most appropriate emoji\nfor a given text. Our experimental results demonstrate that our approach\noutperforms several state-of-the-art models in predicting emojis with an\naccuracy of over 75 percent.
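Fine-tuning BERT for emoji prediction, as in the abstract above, follows the usual sequence-classification recipe; a minimal sketch with Hugging Face transformers, where the label count and data are placeholders:

```python
# Sketch: treat emoji prediction as multi-class sequence classification
# and run a single training step on a toy batch.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=20)               # e.g., 20 candidate emojis

batch = tok(["just landed in paris!"], return_tensors="pt", padding=True)
loss = model(**batch, labels=torch.tensor([3])).loss  # 3 = index of the gold emoji
loss.backward()                                       # one step; optimizer omitted
```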
This work has potential applications in natural\nlanguage processing, sentiment analysis, and social media marketing.\n","authors":["Muhammad Osama Nusrat","Zeeshan Habib","Mehreen Alam","Saad Ahmed Jamal"],"pdf_url":"https://arxiv.org/pdf/2307.02054v3.pdf","comment":"This paper is focused on predicting emojis corresponding to tweets\n using BERT"},{"id":"http://arxiv.org/abs/2308.13844v1","updated":"2023-08-26T10:35:16Z","published":"2023-08-26T10:35:16Z","title":"Solving Math Word Problem with Problem Type Classification","summary":" Math word problems (MWPs) require analyzing text descriptions and generating\nmathematical equations to derive solutions. Existing works focus on solving\nMWPs with two types of solvers: tree-based solver and large language model\n(LLM) solver. However, these approaches always solve MWPs with a single solver,\nwhich raises the following problems: (1) a single type of solver can hardly\nsolve all types of MWPs well; (2) a single solver is prone to poor\nperformance due to over-fitting. To address these challenges, this paper\nutilizes multiple ensemble approaches to improve MWP-solving ability. First,\nwe propose a problem type classifier that combines the strengths of the\ntree-based solver and the LLM solver. This ensemble approach leverages their\nrespective advantages and broadens the range of MWPs that can be solved.\nFurthermore, we also apply ensemble techniques to both tree-based solver and\nLLM solver to improve their performance. For the tree-based solver, we propose\nan ensemble learning framework based on ten-fold cross-validation and voting\nmechanism. In the LLM solver, we adopt the self-consistency (SC) method to improve\nanswer selection. Experimental results demonstrate the effectiveness of these\nensemble approaches in enhancing MWP-solving ability. The comprehensive\nevaluation showcases improved performance, validating the advantages of our\nproposed approach. Our code is available at this url:\nhttps://github.com/zhouzihao501/NLPCC2023-Shared-Task3-ChineseMWP.\n","authors":["Jie Yao","Zihao Zhou","Qiufeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13844v1.pdf","comment":"Accepted by NLPCC2023"},{"id":"http://arxiv.org/abs/2307.07851v3","updated":"2023-08-26T07:24:28Z","published":"2023-07-15T17:01:56Z","title":"AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual\n Similarity Using Contrastive Learning and Structured Knowledge","summary":" Generic sentence embeddings provide a coarse-grained approximation of\nsemantic textual similarity but ignore specific aspects that make texts\nsimilar. Conversely, aspect-based sentence embeddings provide similarities\nbetween texts based on certain predefined aspects. Thus, similarity predictions\nof texts are more targeted to specific requirements and more easily\nexplainable. In this paper, we present AspectCSE, an approach for aspect-based\ncontrastive learning of sentence embeddings. Results indicate that AspectCSE\nachieves an average improvement of 3.97% on information retrieval tasks across\nmultiple aspects compared to the previous best results. We also propose using\nWikidata knowledge graph properties to train models of multi-aspect sentence\nembeddings in which multiple specific aspects are simultaneously considered\nduring similarity predictions. We demonstrate that multi-aspect embeddings\noutperform single-aspect embeddings on aspect-specific information retrieval\ntasks.
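The self-consistency (SC) answer selection adopted by the LLM solver in the MWP abstract above amounts to majority voting over sampled solutions; a minimal sketch with a hypothetical `sample_solution` call standing in for the LLM:

```python
# Sketch: sample several solutions for one problem and keep the answer
# that appears most often (self-consistency voting).
from collections import Counter

def self_consistent_answer(problem: str, sample_solution, n: int = 10):
    answers = [sample_solution(problem) for _ in range(n)]
    return Counter(answers).most_common(1)[0][0]

# Dummy sampler so the sketch runs: a solver that usually answers 42.
import random
ans = self_consistent_answer("toy problem", lambda p: random.choice([42, 42, 41]))
```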
Finally, we examine the aspect-based sentence embedding space and\ndemonstrate that embeddings of semantically similar aspect labels are often\nclose, even without explicit similarity training between different aspect\nlabels.\n","authors":["Tim Schopf","Emanuel Gerber","Malte Ostendorff","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.07851v3.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2308.13782v1","updated":"2023-08-26T06:28:14Z","published":"2023-08-26T06:28:14Z","title":"Planning with Logical Graph-based Language Model for Instruction\n Generation","summary":" Despite the superior performance of large language models in generating natural\nlanguage texts, it is hard for them to generate texts with correct logic for a\ngiven task, due to the difficulty neural models have in capturing implied rules\nfrom free-form texts. In this paper, we propose a novel graph-based language\nmodel, Logical-GLM, to infuse logic into language models for more valid text\ngeneration and interpretability. Specifically, we first capture information\nfrom natural language instructions and construct logical Bayes graphs that\ngenerally describe domains. Next, we generate logical skeletons to guide\nlanguage model training, infusing domain knowledge into language models.\nFinally, we alternately optimize the searching policy of graphs and language\nmodels until convergence. The experimental results show that Logical-GLM is\nboth effective and efficient compared with traditional language models, despite\nusing smaller-scale training data and fewer parameters. Our approach can\ngenerate instructional texts with more correct logic owing to the internalized\ndomain knowledge. Moreover, the usage of logical graphs reflects the inner\nmechanism of the language models, which improves the interpretability of\nblack-box models.\n","authors":["Fan Zhang","Kebing Jin","Hankz Hankui Zhuo"],"pdf_url":"https://arxiv.org/pdf/2308.13782v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2210.11694v2","updated":"2023-08-26T05:57:06Z","published":"2022-10-21T02:44:55Z","title":"Multi-View Reasoning: Consistent Contrastive Learning for Math Word\n Problem","summary":" Math word problem solving requires both precise relational reasoning about\nquantities in the text and reliable generation of diverse equations.\nCurrent sequence-to-tree or relation extraction methods regard this only from a\nfixed view, struggling to simultaneously handle complex semantics and diverse\nequations. However, human solving naturally involves two consistent reasoning\nviews: top-down and bottom-up, just as math equations can be expressed in\nmultiple equivalent forms: pre-order and post-order. We propose multi-view\nconsistent contrastive learning for a more complete semantics-to-equation\nmapping. The entire process is decoupled into two independent but consistent\nviews: top-down decomposition and bottom-up construction, and the two reasoning\nviews are aligned in multi-granularity for consistency, enhancing global\ngeneration and precise reasoning. Experiments on multiple datasets across two\nlanguages show our approach significantly outperforms the existing baselines,\nespecially on complex problems.
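The pre-order and post-order equation forms mentioned in the multi-view abstract above are two serializations of one expression tree, as this toy example shows:

```python
# Sketch: one expression tree, two equivalent traversals. A tree is either
# a leaf token or a tuple (operator, left_subtree, right_subtree).
def pre(t):
    return [t] if not isinstance(t, tuple) else [t[0]] + pre(t[1]) + pre(t[2])

def post(t):
    return [t] if not isinstance(t, tuple) else post(t[1]) + post(t[2]) + [t[0]]

tree = ("+", "3", ("*", "x", "2"))   # represents 3 + x * 2
print(pre(tree))    # ['+', '3', '*', 'x', '2']  (top-down decomposition order)
print(post(tree))   # ['3', 'x', '2', '*', '+']  (bottom-up construction order)
```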
We also show that, after consistent alignment,\nmulti-view reasoning can absorb the merits of both views and generate more diverse\nresults consistent with the mathematical laws.\n","authors":["Wenqi Zhang","Yongliang Shen","Yanna Ma","Xiaoxia Cheng","Zeqi Tan","Qingpeng Nong","Weiming Lu"],"pdf_url":"https://arxiv.org/pdf/2210.11694v2.pdf","comment":"14 pages, 5 figures, 3 appendix figures"},{"id":"http://arxiv.org/abs/2304.06634v2","updated":"2023-08-26T05:55:48Z","published":"2023-04-13T16:02:19Z","title":"PGTask: Introducing the Task of Profile Generation from Dialogues","summary":" Recent approaches have attempted to personalize dialogue systems by\nincorporating profile information into models. However, this knowledge is scarce\nand difficult to obtain, which makes the extraction/generation of profile\ninformation from dialogues a fundamental asset. To overcome this limitation, we\nintroduce the Profile Generation Task (PGTask). We contribute a new\ndataset for this problem, comprising profile sentences aligned with related\nutterances, extracted from a corpus of dialogues. Furthermore, using\nstate-of-the-art methods, we provide a benchmark for profile generation on this\nnovel dataset. Our experiments disclose the challenges of profile generation,\nand we hope that this introduces a new research direction.\n","authors":["Rui Ribeiro","Joao P. Carvalho","Luísa Coheur"],"pdf_url":"https://arxiv.org/pdf/2304.06634v2.pdf","comment":"Accepted at SIGDIAL 2023, 4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.13775v1","updated":"2023-08-26T05:48:57Z","published":"2023-08-26T05:48:57Z","title":"EditSum: A Retrieve-and-Edit Framework for Source Code Summarization","summary":" Existing studies show that code summaries help developers understand and\nmaintain source code. Unfortunately, these summaries are often missing or\noutdated in software projects. Code summarization aims to generate natural\nlanguage descriptions automatically for source code. Code summaries are highly\nstructured and have repetitive patterns. Besides the patternized words, a code\nsummary also contains important keywords, which are the key to reflecting the\nfunctionality of the code. However, the state-of-the-art approaches perform\npoorly on predicting the keywords, which leads to the generated summaries\nsuffering a loss in informativeness. To alleviate this problem, this paper\nproposes a novel retrieve-and-edit approach named EditSum for code\nsummarization. Specifically, EditSum first retrieves a similar code snippet\nfrom a pre-defined corpus and treats its summary as a prototype summary to\nlearn the pattern. Then, EditSum edits the prototype automatically to combine\nthe pattern in the prototype with the semantic information of input code. Our\nmotivation is that the retrieved prototype provides a good starting point for\npost-generation because the summaries of similar code snippets often have the\nsame pattern. The post-editing process further reuses the patternized words in\nthe prototype and generates keywords based on the semantic information of input\ncode. We conduct experiments on a large-scale Java corpus and experimental\nresults demonstrate that EditSum outperforms the state-of-the-art approaches by\na substantial margin. The human evaluation also shows that the summaries generated\nby EditSum are more informative and useful.
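The retrieve step of the retrieve-and-edit pipeline in the EditSum abstract above can be sketched with a crude token-overlap retriever; the similarity measure is an illustrative simplification, not the paper's retriever:

```python
# Sketch: find the most similar corpus snippet by Jaccard token overlap and
# reuse its summary as the prototype for post-editing.
def retrieve_prototype(query_code: str, corpus: list) -> str:
    q = set(query_code.split())
    def overlap(item):                                  # item = (code, summary)
        toks = set(item[0].split())
        return len(q & toks) / max(1, len(q | toks))    # Jaccard similarity
    return max(corpus, key=overlap)[1]                  # return its summary

corpus = [("def add(a, b): return a + b", "Adds two numbers.")]
print(retrieve_prototype("def add(x, y): return x + y", corpus))
```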
We also verify that EditSum\nperforms well on predicting the patternized words and keywords.\n","authors":["Jia Allen Li","Yongmin Li","Ge Li","Xing Hu","Xin Xia","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2308.13775v1.pdf","comment":"Accepted by the 36th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2021)"},{"id":"http://arxiv.org/abs/2308.13768v1","updated":"2023-08-26T05:20:58Z","published":"2023-08-26T05:20:58Z","title":"Adversarial Fine-Tuning of Language Models: An Iterative Optimisation\n Approach for the Generation and Detection of Problematic Content","summary":" In this paper, we tackle the emerging challenge of unintended harmful content\ngeneration in Large Language Models (LLMs) with a novel dual-stage optimisation\ntechnique using adversarial fine-tuning. Our two-pronged approach employs an\nadversarial model, fine-tuned to generate potentially harmful prompts, and a\njudge model, iteratively optimised to discern these prompts. In this\nadversarial cycle, the two models seek to outperform each other in the\nprompting phase, generating a dataset of rich examples which are then used for\nfine-tuning. This iterative application of prompting and fine-tuning allows\ncontinuous refinement and improved performance. The performance of our approach\nis evaluated through classification accuracy on a dataset consisting of\nproblematic prompts not detected by GPT-4, as well as a selection of\ncontentious but unproblematic prompts. We show a considerable increase in\nclassification accuracy of the judge model on this challenging dataset as it\nundergoes the optimisation process. Furthermore, we show that a rudimentary\nmodel \\texttt{ada} can achieve 13\\% higher accuracy on the hold-out test set\nthan GPT-4 after only a few rounds of this process, and that this fine-tuning\nimproves performance in parallel tasks such as toxic comment identification.\n","authors":["Charles O'Neill","Jack Miller","Ioana Ciuca","Yuan-Sen Ting","Thang Bui"],"pdf_url":"https://arxiv.org/pdf/2308.13768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13760v1","updated":"2023-08-26T04:49:46Z","published":"2023-08-26T04:49:46Z","title":"How Can Context Help? Exploring Joint Retrieval of Passage and\n Personalized Context","summary":" The integration of external personalized context information into\ndocument-grounded conversational systems has significant potential business\nvalue, but has not been well-studied. Motivated by the concept of personalized\ncontext-aware document-grounded conversational systems, we introduce the task\nof context-aware passage retrieval. We also construct a dataset specifically\ncurated for this purpose. We describe multiple baseline systems to address this\ntask, and propose a novel approach, Personalized Context-Aware Search (PCAS),\nthat effectively harnesses contextual information during passage retrieval.\nExperimental evaluations conducted on multiple popular dense retrieval systems\ndemonstrate that our proposed approach not only outperforms the baselines in\nretrieving the most relevant passage but also excels at identifying the\npertinent context among all the available contexts.
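The iterative adversary/judge cycle in the adversarial fine-tuning abstract above has roughly the following shape; every component below is a dummy stand-in so the sketch runs end to end, not the paper's models or prompts:

```python
# Sketch: each round, the adversary proposes prompts, the judge classifies
# them, and the judge is fine-tuned on the freshly generated examples.
import random

def adversarial_rounds(gen_prompt, judge, finetune_judge, n_rounds=5):
    for _ in range(n_rounds):
        prompts = [gen_prompt() for _ in range(100)]    # adversary proposes prompts
        labels = [judge(p) for p in prompts]            # judge flags each prompt
        judge = finetune_judge(judge, list(zip(prompts, labels)))
    return judge

judge = adversarial_rounds(lambda: f"prompt-{random.random():.3f}",
                           lambda p: random.random() > 0.5,
                           lambda j, data: j)           # dummy fine-tune: identity
```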
We envision that our\ncontributions will serve as a catalyst for inspiring future research endeavors\nin this promising direction.\n","authors":["Hui Wan","Hongkang Li","Songtao Lu","Xiaodong Cui","Marina Danilevsky"],"pdf_url":"https://arxiv.org/pdf/2308.13760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13754v1","updated":"2023-08-26T03:48:10Z","published":"2023-08-26T03:48:10Z","title":"ZC3: Zero-Shot Cross-Language Code Clone Detection","summary":" Developers introduce code clones to improve programming productivity. Many\nexisting studies have achieved impressive performance in monolingual code clone\ndetection. However, during software development, more and more developers write\nsemantically equivalent programs with different languages to support different\nplatforms and help developers translate projects from one language to another.\nConsidering that collecting cross-language parallel data, especially for\nlow-resource languages, is expensive and time-consuming, how to design an\neffective cross-language model that does not rely on any parallel data is a\nsignificant problem. In this paper, we propose a novel method named ZC3 for\nZero-shot Cross-language Code Clone detection. ZC3 designs the contrastive\nsnippet prediction to form an isomorphic representation space among different\nprogramming languages. Based on this, ZC3 exploits domain-aware learning and\ncycle consistency learning to further constrain the model to generate\nrepresentations that are aligned among different languages while remaining\ndiscriminative for different types of clones. To evaluate our approach, we conduct\nextensive experiments on four representative cross-language clone detection\ndatasets. Experimental results show that ZC3 outperforms the state-of-the-art\nbaselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP score, respectively.\nWe further investigate the representational distribution of different languages\nand discuss the effectiveness of our method.\n","authors":["Jia Li","Chongyang Tao","Zhi Jin","Fang Liu","Jia Allen Li","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2308.13754v1.pdf","comment":"Accepted by the 38th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2023)"},{"id":"http://arxiv.org/abs/2308.13738v1","updated":"2023-08-26T02:52:42Z","published":"2023-08-26T02:52:42Z","title":"On Philomatics and Psychomatics for Combining Philosophy and Psychology\n with Mathematics","summary":" We propose the concepts of philomatics and psychomatics as hybrid\ncombinations of philosophy and psychology with mathematics. We explain four\nmotivations for this combination which are fulfilling the desire of analytical\nphilosophy, proposing science of philosophy, justifying mathematical algorithms\nby philosophy, and abstraction in both philosophy and mathematics. We enumerate\nvarious examples for philomatics and psychomatics, some of which are explained\nin more depth. The first example is the analysis of relation between the\ncontext principle, semantic holism, and the usage theory of meaning with the\nattention mechanism in mathematics. The other example is on the relations of\nPlato's theory of forms in philosophy with the holographic principle in string\ntheory, object-oriented programming, and machine learning. Finally, the\nrelation between Wittgenstein's family resemblance and clustering in\nmathematics is explained.
This paper opens the door of research for combining\nphilosophy and psychology with mathematics.\n","authors":["Benyamin Ghojogh","Morteza Babaie"],"pdf_url":"https://arxiv.org/pdf/2308.13738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18703v5","updated":"2023-08-26T02:42:49Z","published":"2023-05-30T03:00:30Z","title":"Domain Specialization as the Key to Make Large Language Models\n Disruptive: A Comprehensive Survey","summary":" Large language models (LLMs) have significantly advanced the field of natural\nlanguage processing (NLP), providing a highly useful, task-agnostic foundation\nfor a wide range of applications. However, directly applying LLMs to solve\nsophisticated problems in specific domains meets many hurdles, caused by the\nheterogeneity of domain data, the sophistication of domain knowledge, the\nuniqueness of domain objectives, and the diversity of the constraints (e.g.,\nvarious social norms, cultural conformity, religious beliefs, and ethical\nstandards in the domain applications). Domain specification techniques are key\nto make large language models disruptive in many applications. Specifically, to\nsolve these hurdles, there has been a notable increase in research and\npractices conducted in recent years on the domain specialization of LLMs. This\nemerging field of study, with its substantial potential for impact,\nnecessitates a comprehensive and systematic review to better summarize and\nguide ongoing work in this area. In this article, we present a comprehensive\nsurvey on domain specification techniques for large language models, an\nemerging direction critical for large language model applications. First, we\npropose a systematic taxonomy that categorizes the LLM domain-specialization\ntechniques based on the accessibility to LLMs and summarizes the framework for\nall the subcategories as well as their relations and differences to each other.\nSecond, we present an extensive taxonomy of critical application domains that\ncan benefit dramatically from specialized LLMs, discussing their practical\nsignificance and open challenges. Last, we offer our insights into the current\nresearch status and future trends in this area.\n","authors":["Chen Ling","Xujiang Zhao","Jiaying Lu","Chengyuan Deng","Can Zheng","Junxiang Wang","Tanmoy Chowdhury","Yun Li","Hejie Cui","Xuchao Zhang","Tianjiao Zhao","Amit Panalkar","Wei Cheng","Haoyu Wang","Yanchi Liu","Zhengzhang Chen","Haifeng Chen","Chris White","Quanquan Gu","Jian Pei","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2305.18703v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04802v2","updated":"2023-08-26T02:21:05Z","published":"2023-06-07T21:51:56Z","title":"A Survey on Knowledge Graphs for Healthcare: Resources, Applications,\n and Promises","summary":" Healthcare knowledge graphs (HKGs) have emerged as a promising tool for\norganizing medical knowledge in a structured and interpretable way, which\nprovides a comprehensive view of medical concepts and their relationships.\nHowever, challenges such as data heterogeneity and limited coverage remain,\nemphasizing the need for further research in the field of HKGs. This survey\npaper serves as the first comprehensive overview of HKGs. We summarize the\npipeline and key techniques for HKG construction (i.e., from scratch and\nthrough integration), as well as the common utilization approaches (i.e.,\nmodel-free and model-based). 
To provide researchers with valuable resources, we\norganize existing HKGs (The resource is available at\nhttps://github.com/lujiaying/Awesome-HealthCare-KnowledgeBase) based on the\ndata types they capture and application domains, supplemented with pertinent\nstatistical information. In the application section, we delve into the\ntransformative impact of HKGs across various healthcare domains, spanning from\nfine-grained basic science research to high-level clinical decision support.\nLastly, we shed light on the opportunities for creating comprehensive and\naccurate HKGs in the era of large language models, presenting the potential to\nrevolutionize healthcare delivery and enhance the interpretability and\nreliability of clinical prediction.\n","authors":["Hejie Cui","Jiaying Lu","Shiyu Wang","Ran Xu","Wenjing Ma","Shaojun Yu","Yue Yu","Xuan Kan","Chen Ling","Liang Zhao","Joyce Ho","Fei Wang","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2306.04802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11504v2","updated":"2023-08-26T00:33:23Z","published":"2023-03-20T23:54:26Z","title":"Language Model Behavior: A Comprehensive Survey","summary":" Transformer language models have received widespread public attention, yet\ntheir generated text is often surprising even to NLP researchers. In this\nsurvey, we discuss over 250 recent studies of English language model behavior\nbefore task-specific fine-tuning. Language models possess basic capabilities in\nsyntax, semantics, pragmatics, world knowledge, and reasoning, but these\ncapabilities are sensitive to specific inputs and surface features. Despite\ndramatic increases in generated text quality as models scale to hundreds of\nbillions of parameters, the models are still prone to unfactual responses,\ncommonsense errors, memorized text, and social biases. Many of these weaknesses\ncan be framed as over-generalizations or under-generalizations of learned\npatterns in text. We synthesize recent results to highlight what is currently\nknown about large language model capabilities, thus providing a resource for\napplied work and for research in adjacent fields that use language models.\n","authors":["Tyler A. Chang","Benjamin K. Bergen"],"pdf_url":"https://arxiv.org/pdf/2303.11504v2.pdf","comment":"32 pages, accepted to Computational Linguistics"},{"id":"http://arxiv.org/abs/2308.13715v1","updated":"2023-08-26T00:27:08Z","published":"2023-08-26T00:27:08Z","title":"A Computational Evaluation Framework for Singable Lyric Translation","summary":" Lyric translation plays a pivotal role in amplifying the global resonance of\nmusic, bridging cultural divides, and fostering universal connections.\nTranslating lyrics, unlike conventional translation tasks, requires a delicate\nbalance between singability and semantics. In this paper, we present a\ncomputational framework for the quantitative evaluation of singable lyric\ntranslation, which seamlessly integrates musical, linguistic, and cultural\ndimensions of lyrics. Our comprehensive framework consists of four metrics that\nmeasure syllable count distance, phoneme repetition similarity, musical\nstructure distance, and semantic similarity. To substantiate the efficacy of\nour framework, we collected a singable lyrics dataset, which precisely aligns\nEnglish, Japanese, and Korean lyrics on a line-by-line and section-by-section\nbasis, and conducted a comparative analysis between singable and non-singable\nlyrics. 
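Of the four metrics listed in the lyric-translation abstract above, syllable count distance is the simplest to sketch; the syllable counter below is a crude vowel-group heuristic, not the paper's implementation:

```python
# Sketch: average per-line difference in syllable counts between aligned
# source and translated lyrics (lower = more singable alignment).
import re

def syllables(line: str) -> int:
    return max(1, len(re.findall(r"[aeiouy]+", line.lower())))

def syllable_count_distance(src_lines, tgt_lines):
    return sum(abs(syllables(a) - syllables(b))
               for a, b in zip(src_lines, tgt_lines)) / len(src_lines)

print(syllable_count_distance(["let it go, let it go"],
                              ["all is found, all is found"]))
```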
Our multidisciplinary approach provides insights into the key\ncomponents that underlie the art of lyric translation and establishes a solid\ngroundwork for the future of computational lyric translation assessment.\n","authors":["Haven Kim","Kento Watanabe","Masataka Goto","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2308.13715v1.pdf","comment":"ISMIR 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.13820v1","updated":"2023-08-26T09:02:21Z","published":"2023-08-26T09:02:21Z","title":"Video and Audio are Images: A Cross-Modal Mixer for Original Data on\n Video-Audio Retrieval","summary":" Cross-modal retrieval has become popular in recent years, particularly with\nthe rise of multimedia. Generally, each modality exhibits distinct\nrepresentations and semantic information, so features tend to be encoded into\nseparate latent spaces by dual-tower architectures, making it difficult to\nestablish semantic relationships between modalities and resulting in\npoor retrieval performance. To address this issue, we propose a novel framework\nfor cross-modal retrieval which consists of a cross-modal mixer, a masked\nautoencoder for pre-training, and a cross-modal retriever for downstream\ntasks. Specifically, we first adopt a cross-modal mixer and mask modeling to fuse\nthe original modality and eliminate redundancy. Then, an encoder-decoder\narchitecture is applied to achieve a fuse-then-separate task in the\npre-training phase. We feed masked fused representations into the encoder and\nreconstruct them with the decoder, ultimately separating the original data of\ntwo modalities. In downstream tasks, we use the pre-trained encoder to build\nthe cross-modal retrieval method. Extensive experiments on 2 real-world\ndatasets show that our approach outperforms previous state-of-the-art methods\nin video-audio matching tasks, improving retrieval accuracy by up to 2 times.\nFurthermore, we demonstrate our model's generality by transferring it to other\ndownstream tasks as a universal model.\n","authors":["Zichen Yuan","Qi Shen","Bingyi Zheng","Yuting Liu","Linying Jiang","Guibing Guo"],"pdf_url":"https://arxiv.org/pdf/2308.13820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09649v2","updated":"2023-08-26T07:24:00Z","published":"2023-08-18T16:10:13Z","title":"MUSE: Music Recommender System with Shuffle Play Recommendation\n Enhancement","summary":" Recommender systems have become indispensable in music streaming services,\nenhancing user experiences by personalizing playlists and facilitating the\nserendipitous discovery of new music. However, the existing recommender systems\noverlook the unique challenges inherent in the music domain, specifically\nshuffle play, which provides subsequent tracks in a random sequence. Based on\nour observation that the shuffle play sessions hinder the overall training\nprocess of music recommender systems mainly due to the high unique transition\nrates of shuffle play sessions, we propose a Music Recommender System with\nShuffle Play Recommendation Enhancement (MUSE). MUSE employs the\nself-supervised learning framework that maximizes the agreement between the\noriginal session and the augmented session, which is augmented by our novel\nsession augmentation method, called transition-based augmentation. To further\nfacilitate the alignment of the representations between the two views, we\ndevise two fine-grained matching strategies, i.e., item- and similarity-based\nmatching strategies.
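The agreement maximization between an original and an augmented session described in the MUSE abstract above is commonly realized with a contrastive loss; a minimal NT-Xent-style sketch, with encoders and augmentation omitted and the temperature assumed:

```python
# Sketch: treat matching rows of two view embeddings as positives and all
# other rows in the batch as negatives (in-batch contrastive agreement).
import torch
import torch.nn.functional as F

def agreement_loss(z1: torch.Tensor, z2: torch.Tensor, tau: float = 0.1):
    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
    logits = z1 @ z2.t() / tau                 # (batch, batch) similarity matrix
    targets = torch.arange(z1.size(0))         # positives lie on the diagonal
    return F.cross_entropy(logits, targets)

loss = agreement_loss(torch.randn(32, 128), torch.randn(32, 128))
```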
Through rigorous experiments conducted across diverse\nenvironments, we demonstrate MUSE's efficacy over 12 baseline models on a\nlarge-scale Music Streaming Sessions Dataset (MSSD) from Spotify. The source\ncode of MUSE is available at \\url{https://github.com/yunhak0/MUSE}.\n","authors":["Yunhak Oh","Sukwon Yun","Dongmin Hyun","Sein Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2308.09649v2.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.13774v1","updated":"2023-08-26T05:43:29Z","published":"2023-08-26T05:43:29Z","title":"Central Similarity Multi-View Hashing for Multimedia Retrieval","summary":" Hash representation learning of multi-view heterogeneous data is the key to\nimproving the accuracy of multimedia retrieval. However, existing methods\nutilize local similarity and fall short of deeply fusing the multi-view\nfeatures, resulting in poor retrieval accuracy. Current methods only use local\nsimilarity to train their model. These methods ignore global similarity.\nFurthermore, most recent works fuse the multi-view features via a weighted sum\nor concatenation. We contend that these fusion methods are insufficient for\ncapturing the interaction between various views. We present a novel Central\nSimilarity Multi-View Hashing (CSMVH) method to address the mentioned problems.\nCentral similarity learning is used for solving the local similarity problem,\nwhich can utilize the global similarity between the hash center and samples. We\npresent copious empirical data demonstrating the superiority of gate-based\nfusion over conventional approaches. On the MS COCO and NUS-WIDE, the proposed\nCSMVH performs better than the state-of-the-art methods by a large margin (up\nto 11.41% mean Average Precision (mAP) improvement).\n","authors":["Jian Zhu","Wen Cheng","Yu Cui","Chang Tang","Yuyang Dai","Yong Li","Lingfang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.13774v1.pdf","comment":"accepted by the Asia Pacific Web (APWeb) and Web-Age Information\n Management (WAIM) Joint International Conference on Web and Big Data\n (APWeb-WAIM2023)"},{"id":"http://arxiv.org/abs/2308.13760v1","updated":"2023-08-26T04:49:46Z","published":"2023-08-26T04:49:46Z","title":"How Can Context Help? Exploring Joint Retrieval of Passage and\n Personalized Context","summary":" The integration of external personalized context information into\ndocument-grounded conversational systems has significant potential business\nvalue, but has not been well-studied. Motivated by the concept of personalized\ncontext-aware document-grounded conversational systems, we introduce the task\nof context-aware passage retrieval. We also construct a dataset specifically\ncurated for this purpose. We describe multiple baseline systems to address this\ntask, and propose a novel approach, Personalized Context-Aware Search (PCAS),\nthat effectively harnesses contextual information during passage retrieval.\nExperimental evaluations conducted on multiple popular dense retrieval systems\ndemonstrate that our proposed approach not only outperforms the baselines in\nretrieving the most relevant passage but also excels at identifying the\npertinent context among all the available contexts. 
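Central similarity learning, as named in the CSMVH abstract above, pulls each sample's hash code toward a binary class center; one common realization is a binary cross-entropy between relaxed codes and centers, sketched here with placeholder tensors rather than the paper's exact objective:

```python
# Sketch: central similarity loss pulling relaxed hash codes toward their
# class's binary hash center.
import torch
import torch.nn.functional as F

def central_similarity_loss(codes: torch.Tensor, centers: torch.Tensor) -> torch.Tensor:
    """codes: (n, bits) raw network outputs; centers: (n, bits) in {0, 1}."""
    return F.binary_cross_entropy_with_logits(codes, centers)

loss = central_similarity_loss(torch.randn(16, 64),
                               torch.randint(0, 2, (16, 64)).float())
```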
We envision that our\ncontributions will serve as a catalyst for inspiring future research endeavors\nin this promising direction.\n","authors":["Hui Wan","Hongkang Li","Songtao Lu","Xiaodong Cui","Marina Danilevsky"],"pdf_url":"https://arxiv.org/pdf/2308.13760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13754v1","updated":"2023-08-26T03:48:10Z","published":"2023-08-26T03:48:10Z","title":"ZC3: Zero-Shot Cross-Language Code Clone Detection","summary":" Developers introduce code clones to improve programming productivity. Many\nexisting studies have achieved impressive performance in monolingual code clone\ndetection. However, during software development, more and more developers write\nsemantically equivalent programs with different languages to support different\nplatforms and help developers translate projects from one language to another.\nConsidering that collecting cross-language parallel data, especially for\nlow-resource languages, is expensive and time-consuming, how to design an\neffective cross-language model that does not rely on any parallel data is a\nsignificant problem. In this paper, we propose a novel method named ZC3 for\nZero-shot Cross-language Code Clone detection. ZC3 designs the contrastive\nsnippet prediction to form an isomorphic representation space among different\nprogramming languages. Based on this, ZC3 exploits domain-aware learning and\ncycle consistency learning to further constrain the model to generate\nrepresentations that are aligned among different languages while remaining\ndiscriminative for different types of clones. To evaluate our approach, we\nconduct extensive experiments on four representative cross-language clone\ndetection datasets. Experimental results show that ZC3 outperforms the\nstate-of-the-art baselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP\nscore, respectively. We further investigate the representational distribution\nof different languages and discuss the effectiveness of our method.\n","authors":["Jia Li","Chongyang Tao","Zhi Jin","Fang Liu","Jia Allen Li","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2308.13754v1.pdf","comment":"Accepted by the 38th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2023)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.13879v1","updated":"2023-08-26T13:34:17Z","published":"2023-08-26T13:34:17Z","title":"The DiffuseStyleGesture+ entry to the GENEA Challenge 2023","summary":" In this paper, we introduce the DiffuseStyleGesture+, our solution for the\nGeneration and Evaluation of Non-verbal Behavior for Embodied Agents (GENEA)\nChallenge 2023, which aims to foster the development of realistic, automated\nsystems for generating conversational gestures. Participants are provided with\na pre-processed dataset and their systems are evaluated through crowdsourced\nscoring. Our proposed model, DiffuseStyleGesture+, leverages a diffusion model\nto generate gestures automatically. It incorporates a variety of modalities,\nincluding audio, text, speaker ID, and seed gestures. These diverse modalities\nare mapped to a hidden space and processed by a modified diffusion model to\nproduce the corresponding gesture for a given speech input. Upon evaluation,\nthe DiffuseStyleGesture+ demonstrated performance on par with the top-tier\nmodels in the challenge, showing no significant differences with those models\nin human-likeness, appropriateness for the interlocutor, and achieving\ncompetitive performance with the best model on appropriateness for agent\nspeech. 
This indicates that our model is competitive and effective in\ngenerating realistic and appropriate gestures for given speech. The code,\npre-trained models, and demos are available at\nhttps://github.com/YoungSeng/DiffuseStyleGesture/tree/DiffuseStyleGesturePlus/BEAT-TWH-main.\n","authors":["Sicheng Yang","Haiwei Xue","Zhensong Zhang","Minglei Li","Zhiyong Wu","Xiaofei Wu","Songcen Xu","Zonghong Dai"],"pdf_url":"https://arxiv.org/pdf/2308.13879v1.pdf","comment":"7 pages, 8 figures, ICMI 2023"},{"id":"http://arxiv.org/abs/2308.04156v2","updated":"2023-08-26T08:40:25Z","published":"2023-08-08T09:37:18Z","title":"Towards Top-Down Stereoscopic Image Quality Assessment via Stereo\n Attention","summary":" Stereoscopic image quality assessment (SIQA) plays a crucial role in\nevaluating and improving the visual experience of 3D content. Existing\nbinocular properties and attention-based methods for SIQA have achieved\npromising performance. However, these bottom-up approaches are inadequate in\nexploiting the inherent characteristics of the human visual system (HVS). This\npaper presents a novel network for SIQA via stereo attention, employing a\ntop-down perspective to guide the quality assessment process. Our proposed\nmethod realizes the guidance from high-level binocular signals down to\nlow-level monocular signals, while the binocular and monocular information can\nbe calibrated progressively throughout the processing pipeline. We design a\ngeneralized Stereo AttenTion (SAT) block to implement the top-down philosophy\nin stereo perception. This block utilizes the fusion-generated attention map as\na high-level binocular modulator, influencing the representation of two\nlow-level monocular features. Additionally, we introduce an Energy Coefficient\n(EC) to account for recent findings indicating that binocular responses in the\nprimate primary visual cortex are less than the sum of monocular responses. The\nadaptive EC can tune the magnitude of binocular response flexibly, thus\nenhancing the formation of robust binocular features within our framework. To\nextract the most discriminative quality information from the summation and\nsubtraction of the two branches of monocular features, we utilize a\ndual-pooling strategy that applies min-pooling and max-pooling operations to\nthe respective branches. Experimental results highlight the superiority of our\ntop-down method in simulating the property of visual perception and advancing\nthe state-of-the-art in the SIQA field. The code of this work is available at\nhttps://github.com/Fanning-Zhang/SATNet.\n","authors":["Huilin Zhang","Sumei Li","Yongli Chang"],"pdf_url":"https://arxiv.org/pdf/2308.04156v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.13801v1","updated":"2023-08-26T07:55:32Z","published":"2023-08-26T07:55:32Z","title":"Reinforcement Learning Based Multi-modal Feature Fusion Network for\n Novel Class Discovery","summary":" With the development of deep learning techniques, supervised learning has\nachieved performances surpassing those of humans. Researchers have designed\nnumerous corresponding models for different data modalities, achieving\nexcellent results in supervised tasks. However, with the exponential increase\nof data in multiple fields, the recognition and classification of unlabeled\ndata have gradually become a hot topic. 
In this paper, we employed a\nReinforcement Learning framework to simulate the cognitive processes of humans\nfor effectively addressing novel class discovery in the Open-set domain. We\ndeployed a Member-to-Leader Multi-Agent framework to extract and fuse features\nfrom multi-modal information, aiming to acquire a more comprehensive\nunderstanding of the feature space. Furthermore, this approach facilitated the\nincorporation of self-supervised learning to enhance model training. We\nemployed a clustering method with varying constraint conditions, ranging from\nstrict to loose, allowing for the generation of dependable labels for a subset\nof unlabeled data during the training phase. This iterative process is similar\nto human exploratory learning of unknown data. These mechanisms collectively\nupdate the network parameters based on rewards received from environmental\nfeedback. This process enables effective control over the extent of exploration\nlearning, ensuring the accuracy of learning in unknown data categories. We\ndemonstrate the performance of our approach in both the 3D and 2D domains by\nemploying the OS-MN40, OS-MN40-Miss, and Cifar10 datasets. Our approach\nachieves competitive results.\n","authors":["Qiang Li","Qiuyang Ma","Weizhi Nie","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13774v1","updated":"2023-08-26T05:43:29Z","published":"2023-08-26T05:43:29Z","title":"Central Similarity Multi-View Hashing for Multimedia Retrieval","summary":" Hash representation learning of multi-view heterogeneous data is the key to\nimproving the accuracy of multimedia retrieval. However, existing methods\nutilize local similarity and fall short of deeply fusing the multi-view\nfeatures, resulting in poor retrieval accuracy. Current methods only use local\nsimilarity to train their model. These methods ignore global similarity.\nFurthermore, most recent works fuse the multi-view features via a weighted sum\nor concatenation. We contend that these fusion methods are insufficient for\ncapturing the interaction between various views. We present a novel Central\nSimilarity Multi-View Hashing (CSMVH) method to address the mentioned problems.\nCentral similarity learning is used for solving the local similarity problem,\nwhich can utilize the global similarity between the hash center and samples. We\npresent copious empirical data demonstrating the superiority of gate-based\nfusion over conventional approaches. On the MS COCO and NUS-WIDE, the proposed\nCSMVH performs better than the state-of-the-art methods by a large margin (up\nto 11.41% mean Average Precision (mAP) improvement).\n","authors":["Jian Zhu","Wen Cheng","Yu Cui","Chang Tang","Yuyang Dai","Yong Li","Lingfang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.13774v1.pdf","comment":"accepted by the Asia Pacific Web (APWeb) and Web-Age Information\n Management (WAIM) Joint International Conference on Web and Big Data\n (APWeb-WAIM2023)"}]},"2023-08-29T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2201.06313v4","updated":"2023-08-29T17:54:26Z","published":"2022-01-17T09:54:35Z","title":"A Deep Convolutional Neural Networks Based Multi-Task Ensemble Model for\n Aspect and Polarity Classification in Persian Reviews","summary":" Aspect-based sentiment analysis is of great importance and application\nbecause of its ability to identify all aspects discussed in the text. 
However,\naspect-based sentiment analysis will be most effective when, in addition to\nidentifying all the aspects discussed in the text, it can also identify their\npolarity. Most previous methods use the pipeline approach, that is, they first\nidentify the aspects and then identify the polarities. Such methods are\nunsuitable for practical applications since they can lead to model errors.\nTherefore, in this study, we propose a multi-task learning model based on\nConvolutional Neural Networks (CNNs), which can simultaneously detect aspect\ncategories and their polarities. Creating a model alone may not provide the\nbest predictions and may lead to errors such as bias and high variance.\nTo reduce these errors and improve the efficiency of model predictions,\ncombining several models, known as ensemble learning, may provide better\nresults. Therefore, the main purpose of this article is to create a model based\non an ensemble of multi-task deep convolutional neural networks to enhance\nsentiment analysis in Persian reviews. We evaluated the proposed method using a\nPersian language dataset in the movie domain. The Jaccard index and Hamming\nloss measures were used to evaluate the performance of the developed models.\nThe results indicate that this new approach increases the efficiency of the\nsentiment analysis model in the Persian language.\n","authors":["Milad Vazan","Fatemeh Sadat Masoumi","Sepideh Saeedi Majd"],"pdf_url":"https://arxiv.org/pdf/2201.06313v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14120v2","updated":"2023-08-29T17:52:02Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this\ngap and perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT ADA without specific guidance. ChatGPT ADA autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. We conclude that chatGPT ADA offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15459v1","updated":"2023-08-29T17:36:02Z","published":"2023-08-29T17:36:02Z","title":"ParaGuide: Guided Diffusion Paraphrasers for Plug-and-Play Textual Style\n Transfer","summary":" Textual style transfer is the task of transforming stylistic properties of\ntext while preserving meaning. Target \"styles\" can be defined in numerous ways,\nranging from single attributes (e.g., formality) to authorship (e.g.,\nShakespeare). 
Previous unsupervised style-transfer approaches generally rely on\nsignificant amounts of labeled data for only a fixed set of styles or require\nlarge language models. In contrast, we introduce a novel diffusion-based\nframework for general-purpose style transfer that can be flexibly adapted to\narbitrary target styles at inference time. Our parameter-efficient approach,\nParaGuide, leverages paraphrase-conditioned diffusion models alongside\ngradient-based guidance from both off-the-shelf classifiers and strong existing\nstyle embedders to transform the style of text while preserving semantic\ninformation. We validate the method on the Enron Email Corpus, with both human\nand automatic evaluations, and find that it outperforms strong baselines on\nformality, sentiment, and even authorship style transfer.\n","authors":["Zachary Horvitz","Ajay Patel","Chris Callison-Burch","Zhou Yu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2308.15459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15452v1","updated":"2023-08-29T17:22:39Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" The reasoning capabilities of Large Language Models (LLMs) play a pivotal\nrole in the realm of embodied artificial intelligence. Although there are\neffective methods like program-of-thought prompting for LLMs which use\nprogramming languages to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find that not all code data of varying complexity can be learned\nor understood by LLMs. An optimal level of complexity is critical to the\nimprovement of reasoning abilities by program-aided prompting. Then we design\nan auto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrate the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.15448v1","updated":"2023-08-29T17:19:32Z","published":"2023-08-29T17:19:32Z","title":"Vulgar Remarks Detection in Chittagonian Dialect of Bangla","summary":" The negative effects of online bullying and harassment are increasing with\nInternet popularity, especially in social media. 
One solution is using natural\nlanguage processing (NLP) and machine learning (ML) methods for the automatic\ndetection of harmful remarks, but these methods are limited in low-resource\nlanguages like the Chittagonian dialect of Bangla. This study focuses on\ndetecting vulgar remarks in social media using supervised ML and deep learning\nalgorithms. Logistic Regression achieved promising accuracy (0.91) while a\nsimple RNN with Word2vec and fastText had lower accuracy (0.84-0.90),\nhighlighting the issue that NN algorithms require more data.\n","authors":["Tanjim Mahmud","Michal Ptaszynski","Fumito Masui"],"pdf_url":"https://arxiv.org/pdf/2308.15448v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2306.08018v2","updated":"2023-08-29T17:13:05Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a meticulously curated, comprehensive\ninstruction dataset expressly designed for the biomolecular realm.\nMol-Instructions is composed of three pivotal components: molecule-oriented\ninstructions, protein-oriented instructions, and biomolecular text\ninstructions, each curated to enhance the understanding and prediction\ncapabilities of LLMs concerning biomolecular features and behaviors. Through\nextensive instruction tuning experiments on the representative LLM, we\nunderscore the potency of Mol-Instructions to enhance the adaptability and\ncognitive acuity of large models within the complex sphere of biomolecular\nstudies, thereby promoting advancements in the biomolecular research community.\nMol-Instructions is made publicly accessible for future research endeavors and\nwill be subjected to continual updates for enhanced applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v2.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions. Add\n quantitative evaluations"},{"id":"http://arxiv.org/abs/2112.09153v2","updated":"2023-08-29T17:04:19Z","published":"2021-12-16T19:00:55Z","title":"An Empirical Investigation of the Role of Pre-training in Lifelong\n Learning","summary":" The lifelong learning paradigm in machine learning is an attractive\nalternative to the more prominent isolated learning scheme not only due to its\nresemblance to biological learning but also its potential to reduce energy\nwaste by obviating excessive model re-training. A key challenge to this\nparadigm is the phenomenon of catastrophic forgetting. With the increasing\npopularity and success of pre-trained models in machine learning, we pose the\nquestion: What role does pre-training play in lifelong learning, specifically\nwith respect to catastrophic forgetting? We investigate existing methods in the\ncontext of large, pre-trained models and evaluate their performance on a\nvariety of text and image classification tasks, including a large-scale study\nusing a novel data set of 15 diverse NLP tasks. 
Across all settings, we observe\nthat generic pre-training implicitly alleviates the effects of catastrophic\nforgetting when learning multiple tasks sequentially compared to randomly\ninitialized models. We then further investigate why pre-training alleviates\nforgetting in this setting. We study this phenomenon by analyzing the loss\nlandscape, finding that pre-trained weights appear to ease forgetting by\nleading to wider minima. Based on this insight, we propose jointly optimizing\nfor current task loss and loss basin sharpness to explicitly encourage wider\nbasins during sequential fine-tuning. We show that this optimization approach\noutperforms several state-of-the-art task-sequential continual learning\nalgorithms across multiple settings, occasionally even without retaining a\nmemory that scales in size with the number of tasks.\n","authors":["Sanket Vaibhav Mehta","Darshan Patil","Sarath Chandar","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2112.09153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07740v2","updated":"2023-08-29T16:55:11Z","published":"2023-07-15T08:08:38Z","title":"Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model","summary":" Sentiment analysis is the process of identifying and categorizing people's\nemotions or opinions regarding various topics. The analysis of Twitter\nsentiment has become an increasingly popular topic in recent years. In this\npaper, we present several machine learning models and a deep learning model to\nanalyze the sentiment of Persian political tweets. Our analysis was conducted\nusing Bag of Words and ParsBERT for word representation. We applied Gaussian\nNaive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random\nForests, as well as a combination of CNN and LSTM to classify the polarities of\ntweets. The results of this study indicate that deep learning with ParsBERT\nembedding performs better than machine learning. The CNN-LSTM model had the\nhighest classification accuracy with 89 percent on the first dataset and 71\npercent on the second dataset. Due to the complexity of Persian, it was a\ndifficult task to achieve this level of efficiency. The main objective of our\nresearch was to reduce the training time while maintaining the model's\nperformance. As a result, several adjustments were made to the model\narchitecture and parameters. In addition to achieving the objective, the\nperformance was slightly improved as well.\n","authors":["Mohammad Dehghani","Zahra Yazdanparast"],"pdf_url":"https://arxiv.org/pdf/2307.07740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15419v1","updated":"2023-08-29T16:24:09Z","published":"2023-08-29T16:24:09Z","title":"Characterizing Learning Curves During Language Model Pre-Training:\n Learning, Forgetting, and Stability","summary":" How do language models learn to make predictions during pre-training? To\nstudy this question, we extract learning curves from five autoregressive\nEnglish language model pre-training runs, for 1M tokens in context. We observe\nthat the language models generate short repetitive phrases before learning to\ngenerate longer and more coherent text. We quantify the final surprisal,\nwithin-run variability, age of acquisition, forgettability, and cross-run\nvariability of learning curves for individual tokens in context. More frequent\ntokens reach lower final surprisals, exhibit less variability within and across\npre-training runs, are learned earlier, and are less likely to be \"forgotten\"\nduring pre-training. 
Higher n-gram probabilities further accentuate these\neffects. Independent of the target token, shorter and more frequent contexts\ncorrelate with marginally more stable and quickly acquired predictions. Effects\nof part-of-speech are also small, although nouns tend to be acquired later and\nless stably than verbs, adverbs, and adjectives. Our work contributes to a\nbetter understanding of language model pre-training dynamics and informs the\ndeployment of stable language models in practice.\n","authors":["Tyler A. Chang","Zhuowen Tu","Benjamin K. Bergen"],"pdf_url":"https://arxiv.org/pdf/2308.15419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15399v1","updated":"2023-08-29T15:57:32Z","published":"2023-08-29T15:57:32Z","title":"Rethinking Machine Ethics -- Can LLMs Perform Moral Reasoning through\n the Lens of Moral Theories?","summary":" Making moral judgments is an essential step toward developing ethical AI\nsystems. Prevalent approaches are mostly implemented in a bottom-up manner,\nwhich uses a large set of annotated data to train models based on crowd-sourced\nopinions about morality. These approaches have been criticized for potentially\novergeneralizing a limited group of annotators' moral stances and lacking\nexplainability. In contrast, top-down approaches make moral judgments grounded\nin a set of principles. However, it remains conceptual due to the incapability\nof previous language models and the unsolved debate among moral principles. In\nthis study, we propose a flexible framework to steer Large Language Models\n(LLMs) to perform moral reasoning with well-established moral theories from\ninterdisciplinary research. The theory-guided top-down framework can\nincorporate various moral theories. Our experiments demonstrate the\neffectiveness of the proposed framework on datasets derived from moral\ntheories. Furthermore, we show the alignment between different moral theories\nand existing morality datasets. Our analysis exhibits the potentials and flaws\nin existing resources (models and datasets) in developing explainable moral\njudgment-making systems.\n","authors":["Jingyan Zhou","Minda Hu","Junan Li","Xiaoying Zhang","Xixin Wu","Irwin King","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.15399v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2308.12896v2","updated":"2023-08-29T15:57:02Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. 
scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v2.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.14641v2","updated":"2023-08-29T15:48:23Z","published":"2023-08-28T15:12:34Z","title":"Challenges of GPT-3-based Conversational Agents for Healthcare","summary":" The potential to provide patients with faster information access while\nallowing medical specialists to concentrate on critical tasks makes medical\ndomain dialog agents appealing. However, the integration of large-language\nmodels (LLMs) into these agents presents certain limitations that may result in\nserious consequences. This paper investigates the challenges and risks of using\nGPT-3-based models for medical question-answering (MedQA). We perform several\nevaluations contextualized in terms of standard medical principles. We provide\na procedure for manually designing patient queries to stress-test high-risk\nlimitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to\nrespond adequately to these queries, generating erroneous medical information,\nunsafe recommendations, and content that may be considered offensive.\n","authors":["Fabian Lechner","Allison Lahnala","Charles Welch","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.14641v2.pdf","comment":"12 pages, 9 Tables, accepted to RANLP 2023"},{"id":"http://arxiv.org/abs/2304.11073v2","updated":"2023-08-29T15:02:08Z","published":"2023-04-20T09:30:50Z","title":"OLISIA: a Cascade System for Spoken Dialogue State Tracking","summary":" Though Dialogue State Tracking (DST) is a core component of spoken dialogue\nsystems, recent work on this task mostly deals with chat corpora, disregarding\nthe discrepancies between spoken and written language. In this paper, we\npropose OLISIA, a cascade system which integrates an Automatic Speech\nRecognition (ASR) model and a DST model. We introduce several adaptations in\nthe ASR and DST modules to improve integration and robustness to spoken\nconversations. With these adaptations, our system ranked first in DSTC11 Track\n3, a benchmark to evaluate spoken DST. We conduct an in-depth analysis of the\nresults and find that normalizing the ASR outputs and adapting the DST inputs\nthrough data augmentation, along with increasing the pre-trained model size,\nall play an important role in reducing the performance discrepancy between\nwritten and spoken conversations.\n","authors":["Léo Jacqmin","Lucas Druart","Yannick Estève","Benoît Favre","Lina Maria Rojas-Barahona","Valentin Vielzeuf"],"pdf_url":"https://arxiv.org/pdf/2304.11073v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15363v1","updated":"2023-08-29T14:59:54Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for the\nText-to-SQL task. However, the absence of a systematic benchmark inhibits the\ndesign of effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematic and\nextensive comparison over existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborate on their pros and cons. 
Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar.\nTowards an efficient and economic LLM-based Text-to-SQL solution, we emphasize\ntoken efficiency in prompt engineering and compare prior studies under this\nmetric. Additionally, we investigate open-source LLMs in in-context\nlearning, and further enhance their performance with task-specific supervised\nfine-tuning. Our explorations highlight open-source LLMs' potential in\nText-to-SQL, as well as the advantages and disadvantages of task-specific\nsupervised fine-tuning. We hope that our work provides a deeper understanding\nof Text-to-SQL with LLMs, and inspires further investigations and broad\napplications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v1.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2302.02083v4","updated":"2023-08-29T14:55:37Z","published":"2023-02-04T03:50:01Z","title":"Theory of Mind Might Have Spontaneously Emerged in Large Language Models","summary":" We explore the intriguing possibility that theory of mind (ToM), or the\nuniquely human ability to impute unobservable mental states to others, might\nhave spontaneously emerged in large language models (LLMs). We designed 40\nfalse-belief tasks, considered a gold standard in testing ToM in humans, and\nadministered them to several LLMs. Each task included a false-belief scenario,\nthree closely matched true-belief controls, and the reversed versions of all\nfour. Smaller and older models solved no tasks; GPT-3-davinci-001 (from May\n2020) and GPT-3-davinci-002 (from January 2022) solved 10%; and\nGPT-3-davinci-003 (from November 2022) and ChatGPT-3.5-turbo (from March 2023)\nsolved 35% of the tasks, mirroring the performance of three-year-old children.\nChatGPT-4 (from June 2023) solved 90% of the tasks, matching the performance of\nseven-year-old children. These findings suggest the intriguing possibility that\nToM, previously considered exclusive to humans, may have spontaneously emerged\nas a byproduct of LLMs' improving language skills.\n","authors":["Michal Kosinski"],"pdf_url":"https://arxiv.org/pdf/2302.02083v4.pdf","comment":"TRY RUNNING ToM EXPERIMENTS ON YOUR OWN: The code and tasks used in\n this study are available at Colab\n (https://colab.research.google.com/drive/1ZRtmw87CdA4xp24DNS_Ik_uA2ypaRnoU).\n Don't worry if you are not an expert coder, you should be able to run this\n code with no-to-minimum Python skills. Or copy-paste the tasks to ChatGPT's\n web interface"},{"id":"http://arxiv.org/abs/2308.15352v1","updated":"2023-08-29T14:47:08Z","published":"2023-08-29T14:47:08Z","title":"Historical patterns of rice farming explain modern-day language use in\n China and Japan more than modernization and urbanization","summary":" We used natural language processing to analyze a billion words to study\ncultural differences on Weibo, one of China's largest social media platforms.\nWe compared predictions from two common explanations about cultural differences\nin China (economic development and urban-rural differences) against the\nless-obvious legacy of rice versus wheat farming. Rice farmers had to\ncoordinate shared irrigation networks and exchange labor to cope with higher\nlabor requirements. 
In contrast, wheat relied on rainfall and required half as\nmuch labor. We test whether this legacy made southern China more\ninterdependent. Across all word categories, rice explained twice as much\nvariance as economic development and urbanization. Rice areas used more words\nreflecting tight social ties, holistic thought, and a cautious, prevention\norientation. We then used Twitter data comparing prefectures in Japan, which\nlargely replicated the results from China. This provides crucial evidence of\nthe rice theory in a different nation, language, and platform.\n","authors":["Sharath Chandra Guntuku","Thomas Talhelm","Garrick Sherman","Angel Fan","Salvatore Giorgi","Liuqing Wei","Lyle H. Ungar"],"pdf_url":"https://arxiv.org/pdf/2308.15352v1.pdf","comment":"Includes Supplemental Materials"},{"id":"http://arxiv.org/abs/2308.15334v1","updated":"2023-08-29T14:29:57Z","published":"2023-08-29T14:29:57Z","title":"A Framework for Responsible Development of Automated Student Feedback\n with Generative AI","summary":" Providing rich feedback to students is essential for supporting student\nlearning. Recent advances in generative AI, particularly within large language\nmodelling (LLM), provide the opportunity to deliver repeatable, scalable and\ninstant automatically generated feedback to students, making abundant a\npreviously scarce and expensive learning resource. Such an approach is feasible\nfrom a technical perspective due to these recent advances in Artificial\nIntelligence (AI) and Natural Language Processing (NLP); while the potential\nupside is a strong motivator, doing so introduces a range of potential ethical\nissues that must be considered as we apply these technologies. The\nattractiveness of AI systems is that they can effectively automate the most\nmundane tasks; but this risks introducing a \"tyranny of the majority\", where\nthe needs of minorities in the long tail are overlooked because they are\ndifficult to automate.\n Developing machine learning models that can generate valuable and authentic\nfeedback requires the input of human domain experts. The choices we make in\ncapturing this expertise -- whose, which, when, and how -- will have\nsignificant consequences for the nature of the resulting feedback. How we\nmaintain our models will affect how that feedback remains relevant given\ntemporal changes in context, theory, and prior learning profiles of student\ncohorts. These questions are important from an ethical perspective; but they\nare also important from an operational perspective. Unless they can be\nanswered, our AI generated systems will lack the trust necessary for them to be\nuseful features in the contemporary learning environment.\n This article will outline the frontiers of automated feedback, identify the\nethical issues involved in the provision of automated feedback and present a\nframework to assist academics to develop such systems responsibly.\n","authors":["Euan D Lindsay","Aditya Johri","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2308.15334v1.pdf","comment":"10 pages, under review at IEEE TLT"},{"id":"http://arxiv.org/abs/2308.04645v2","updated":"2023-08-29T14:09:49Z","published":"2023-08-09T01:02:06Z","title":"Cross-Lingual Constituency Parsing for Middle High German: A\n Delexicalized Approach","summary":" Constituency parsing plays a fundamental role in advancing natural language\nprocessing (NLP) tasks. 
However, training an automatic syntactic analysis\nsystem for ancient languages solely relying on annotated parse data is a\nformidable task due to the inherent challenges in building treebanks for such\nlanguages. It demands extensive linguistic expertise, leading to a scarcity of\navailable resources. To overcome this hurdle, cross-lingual transfer techniques\nwhich require minimal or even no annotated data for low-resource target\nlanguages offer a promising solution. In this study, we focus on building a\nconstituency parser for $\\mathbf{M}$iddle $\\mathbf{H}$igh $\\mathbf{G}$erman\n($\\mathbf{MHG}$) under realistic conditions, where no annotated MHG treebank is\navailable for training. In our approach, we leverage the linguistic continuity\nand structural similarity between MHG and $\\mathbf{M}$odern $\\mathbf{G}$erman\n($\\mathbf{MG}$), along with the abundance of MG treebank resources.\nSpecifically, by employing the $\\mathit{delexicalization}$ method, we train a\nconstituency parser on MG parse datasets and perform cross-lingual transfer to\nMHG parsing. Our delexicalized constituency parser demonstrates remarkable\nperformance on the MHG test set, achieving an F1-score of 67.3%. It outperforms\nthe best zero-shot cross-lingual baseline by a margin of 28.6% points. These\nencouraging results underscore the practicality and potential for automatic\nsyntactic analysis in other ancient languages that face similar challenges as\nMHG.\n","authors":["Ercong Nie","Helmut Schmid","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2308.04645v2.pdf","comment":"Accepted to ALP 2023"},{"id":"http://arxiv.org/abs/2305.13862v2","updated":"2023-08-29T13:55:13Z","published":"2023-05-23T09:35:37Z","title":"A Trip Towards Fairness: Bias and De-Biasing in Large Language Models","summary":" Cheap-to-Build Very Large-Language Models (CtB-LLMs) with affordable training\nare emerging as the next big revolution in natural language processing and\nunderstanding. These CtB-LLMs are democratizing access to trainable Very\nLarge-Language Models (VLLMs) and, thus, may represent the building blocks of\nmany NLP systems solving downstream tasks. Hence, a little or a large bias in\nCtB-LLMs may cause huge harm. In this paper, we performed a large investigation\nof the bias of three families of CtB-LLMs, and we showed that debiasing\ntechniques are effective and usable. Indeed, according to current tests, the\nLLaMA and the OPT families have an important bias in gender, race, religion,\nand profession. In contrast to the analysis for other LLMs, we discovered that\nbias depends not on the number of parameters but on the perplexity. Finally,\nthe debiasing of OPT using LoRA reduces bias up to 4.12 points in the\nnormalized stereotype score.\n","authors":["Leonardo Ranaldi","Elena Sofia Ruzzetti","Davide Venditti","Dario Onorati","Fabio Massimo Zanzotto"],"pdf_url":"https://arxiv.org/pdf/2305.13862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15299v1","updated":"2023-08-29T13:36:45Z","published":"2023-08-29T13:36:45Z","title":"TaskLAMA: Probing the Complex Task Understanding of Language Models","summary":" Structured Complex Task Decomposition (SCTD) is the problem of breaking down\na complex real-world task (such as planning a wedding) into a directed acyclic\ngraph over individual steps that contribute to achieving the task, with edges\nspecifying temporal dependencies between them. 
SCTD is an important component\nof assistive planning tools, and a challenge for commonsense reasoning systems.\nWe probe how accurately SCTD can be done with the knowledge extracted from\nLarge Language Models (LLMs). We introduce a high-quality human-annotated\ndataset for this problem and novel metrics to fairly assess performance of LLMs\nagainst several baselines. Our experiments reveal that LLMs are able to\ndecompose complex tasks into individual steps effectively, with a relative\nimprovement of 15% to 280% over the best baseline. We also propose a number of\napproaches to further improve their performance, with a relative improvement of\n7% to 37% over the base model. However, we find that LLMs still struggle to\npredict pairwise temporal dependencies, which reveals a gap in their\nunderstanding of complex tasks.\n","authors":["Quan Yuan","Mehran Kazemi","Xin Xu","Isaac Noble","Vaiva Imbrasaite","Deepak Ramachandran"],"pdf_url":"https://arxiv.org/pdf/2308.15299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15298v1","updated":"2023-08-29T13:35:51Z","published":"2023-08-29T13:35:51Z","title":"KGConv, a Conversational Corpus grounded in Wikidata","summary":" We present KGConv, a large, conversational corpus of 71k conversations where\neach question-answer pair is grounded in a Wikidata fact. Conversations contain\non average 8.6 questions and for each Wikidata fact, we provide multiple\nvariants (12 on average) of the corresponding question using templates, human\nannotations, hand-crafted rules and a question rewriting neural model. We\nprovide baselines for the task of Knowledge-Based, Conversational Question\nGeneration. KGConv can further be used for other generation and analysis tasks\nsuch as single-turn question generation from Wikidata triples, question\nrewriting, question answering from conversation or from knowledge graphs and\nquiz generation.\n","authors":["Quentin Brabant","Gwenole Lecorve","Lina M. Rojas-Barahona","Claire Gardent"],"pdf_url":"https://arxiv.org/pdf/2308.15298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11167v3","updated":"2023-08-29T13:33:52Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if not none, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impede human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. 
The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v3.pdf","comment":"V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption\n overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet\n results in Section 4.3 (added). 22 pages with Appendix"},{"id":"http://arxiv.org/abs/2307.09162v2","updated":"2023-08-29T13:15:24Z","published":"2023-07-18T11:38:45Z","title":"Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and\n Addressing Sociological Implications","summary":" Gender bias in artificial intelligence (AI) and natural language processing\nhas garnered significant attention due to its potential impact on societal\nperceptions and biases. This research paper aims to analyze gender bias in\nLarge Language Models (LLMs) with a focus on multiple comparisons between GPT-2\nand GPT-3.5, some prominent language models, to better understand its\nimplications. Through a comprehensive literature review, the study examines\nexisting research on gender bias in AI language models and identifies gaps in\nthe current knowledge. The methodology involves collecting and preprocessing\ndata from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis\ntechniques to evaluate gender bias in the generated text. The findings shed\nlight on gendered word associations, language usage, and biased narratives\npresent in the outputs of these Large Language Models. The discussion explores\nthe ethical implications of gender bias and its potential consequences on\nsocial perceptions and marginalized communities. Additionally, the paper\npresents strategies for reducing gender bias in LLMs, including algorithmic\napproaches and data augmentation techniques. The research highlights the\nimportance of interdisciplinary collaborations and the role of sociological\nstudies in mitigating gender bias in AI models. By addressing these issues, we\ncan pave the way for more inclusive and unbiased AI systems that have a\npositive impact on society.\n","authors":["Vishesh Thakur"],"pdf_url":"https://arxiv.org/pdf/2307.09162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15262v1","updated":"2023-08-29T12:41:50Z","published":"2023-08-29T12:41:50Z","title":"Enhancing OCR Performance through Post-OCR Models: Adopting Glyph\n Embedding for Improved Correction","summary":" The study investigates the potential of post-OCR models to overcome\nlimitations in OCR models and explores the impact of incorporating glyph\nembedding on post-OCR correction performance. In this study, we have developed\nour own post-OCR correction model. 
The novelty of our approach lies in\nembedding the OCR output using CharBERT and our unique embedding technique,\ncapturing the visual characteristics of characters. Our findings show that\npost-OCR correction effectively addresses deficiencies in inferior OCR models,\nand glyph embedding enables the model to achieve superior results, including\nthe ability to correct individual words.\n","authors":["Yung-Hsin Chen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01681v3","updated":"2023-08-29T12:20:15Z","published":"2023-08-03T10:48:30Z","title":"NBIAS: A Natural Language Processing Framework for Bias Identification\n in Text","summary":" Bias in textual data can lead to skewed interpretations and outcomes when the\ndata is used. These biases could perpetuate stereotypes, discrimination, or\nother forms of unfair treatment. An algorithm trained on biased data may end up\nmaking decisions that disproportionately impact a certain group of people.\nTherefore, it is crucial to detect and remove these biases to ensure the fair\nand ethical use of data. To this end, we develop a comprehensive and robust\nframework NBIAS that consists of four main layers: data, corpus construction,\nmodel development and an evaluation layer. The dataset is constructed by\ncollecting diverse data from various domains, including social media,\nhealthcare, and job hiring portals. As such, we applied a transformer-based\ntoken classification model that is able to identify bias words/ phrases through\na unique named entity BIAS. In the evaluation procedure, we incorporate a blend\nof quantitative and qualitative measures to gauge the effectiveness of our\nmodels. We achieve accuracy improvements ranging from 1% to 8% compared to\nbaselines. We are also able to generate a robust understanding of the model\nfunctioning. The proposed approach is applicable to a variety of biases and\ncontributes to the fair and ethical use of textual data.\n","authors":["Shaina Raza","Muskan Garg","Deepak John Reji","Syed Raza Bashir","Chen Ding"],"pdf_url":"https://arxiv.org/pdf/2308.01681v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.15246v1","updated":"2023-08-29T12:12:53Z","published":"2023-08-29T12:12:53Z","title":"A Classification-Guided Approach for Adversarial Attacks against Neural\n Machine Translation","summary":" Neural Machine Translation (NMT) models have been shown to be vulnerable to\nadversarial attacks, wherein carefully crafted perturbations of the input can\nmislead the target model. In this paper, we introduce ACT, a novel adversarial\nattack framework against NMT systems guided by a classifier. In our attack, the\nadversary aims to craft meaning-preserving adversarial examples whose\ntranslations by the NMT model belong to a different class than the original\ntranslations in the target language. Unlike previous attacks, our new approach\nhas a more substantial effect on the translation by altering the overall\nmeaning, which leads to a different class determined by a classifier. To\nevaluate the robustness of NMT models to this attack, we propose enhancements\nto existing black-box word-replacement-based attacks by incorporating output\ntranslations of the target NMT model and the output logits of a classifier\nwithin the attack process. 
Extensive experiments in various settings, including\na comparison with existing untargeted attacks, demonstrate that the proposed\nattack is considerably more successful in altering the class of the output\ntranslation and has more effect on the translation. This new paradigm can show\nthe vulnerabilities of NMT systems by focusing on the class of translation\nrather than the mere translation quality as studied traditionally.\n","authors":["Sahar Sadrizadeh","Ljiljana Dolamic","Pascal Frossard"],"pdf_url":"https://arxiv.org/pdf/2308.15246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15235v1","updated":"2023-08-29T11:46:27Z","published":"2023-08-29T11:46:27Z","title":"PronounFlow: A Hybrid Approach for Calibrating Pronouns in Sentences","summary":" Flip through any book or listen to any song lyrics, and you will come across\npronouns that, in certain cases, can hinder meaning comprehension, especially\nfor machines. As the role of having cognitive machines becomes pervasive in our\nlives, numerous systems have been developed to resolve pronouns under various\nchallenges. Commensurate with this, it is believed that having systems able to\ndisambiguate pronouns in sentences will help towards the endowment of machines\nwith commonsense and reasoning abilities like those found in humans. However,\none problem these systems face with modern English is the lack of gender\npronouns, where people try to alternate by using masculine, feminine, or plural\nto avoid the whole issue. Since humanity aims to the building of systems in the\nfull-bodied sense we usually reserve for people, what happens when pronouns in\nwritten text, like plural or epicene ones, refer to unspecified entities whose\ngender is not necessarily known? Wouldn't that put extra barriers to existing\ncoreference resolution systems? Towards answering those questions, through the\nimplementation of a neural-symbolic system that utilizes the best of both\nworlds, we are employing PronounFlow, a system that reads any English sentence\nwith pronouns and entities, identifies which of them are not tied to each\nother, and makes suggestions on which to use to avoid biases. Undertaken\nexperiments show that PronounFlow not only alternates pronouns in sentences\nbased on the collective human knowledge around us but also considerably helps\ncoreference resolution systems with the pronoun disambiguation process.\n","authors":["Nicos Isaak"],"pdf_url":"https://arxiv.org/pdf/2308.15235v1.pdf","comment":"13 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.15232v1","updated":"2023-08-29T11:40:24Z","published":"2023-08-29T11:40:24Z","title":"Classification-Aware Neural Topic Model Combined With Interpretable\n Analysis -- For Conflict Classification","summary":" A large number of conflict events are affecting the world all the time. In\norder to analyse such conflict events effectively, this paper presents a\nClassification-Aware Neural Topic Model (CANTM-IA) for Conflict Information\nClassification and Topic Discovery. The model provides a reliable\ninterpretation of classification results and discovered topics by introducing\ninterpretability analysis. 
At the same time, interpretation is introduced into\nthe model architecture to improve the classification performance of the model\nand to allow interpretation to focus further on the details of the data.\nFinally, the model architecture is optimised to reduce the complexity of the\nmodel.\n","authors":["Tianyu Liang","Yida Mu","Soonho Kim","Darline Larissa Kengne Kuate","Julie Lang","Rob Vos","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.15232v1.pdf","comment":"Accepted by RANLP 2023"},{"id":"http://arxiv.org/abs/2308.15231v1","updated":"2023-08-29T11:40:03Z","published":"2023-08-29T11:40:03Z","title":"Multi-party Goal Tracking with LLMs: Comparing Pre-training,\n Fine-tuning, and Prompt Engineering","summary":" This paper evaluates the extent to which current Large Language Models (LLMs)\ncan capture task-oriented multi-party conversations (MPCs). We have recorded\nand transcribed 29 MPCs between patients, their companions, and a social robot\nin a hospital. We then annotated this corpus for multi-party goal-tracking and\nintent-slot recognition. People share goals, answer each other's goals, and\nprovide other people's goals in MPCs - none of which occur in dyadic\ninteractions. To understand user goals in MPCs, we compared three methods in\nzero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks\nto train DialogLM using LED, and employed prompt engineering techniques with\nGPT-3.5-turbo, to determine which approach can complete this novel task with\nlimited data. GPT-3.5-turbo significantly outperformed the others in a few-shot\nsetting. The `reasoning' style prompt, when given 7% of the corpus as example\nannotated conversations, was the best performing method. It correctly annotated\n62.32% of the goal tracking MPCs, and 69.57% of the intent-slot recognition\nMPCs. A `story' style prompt increased model hallucination, which could be\ndetrimental if deployed in safety-critical settings. We conclude that\nmulti-party conversations still challenge state-of-the-art LLMs.\n","authors":["Angus Addlesee","Weronika Sieińska","Nancie Gunson","Daniel Hernández Garcia","Christian Dondrup","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2308.15231v1.pdf","comment":"Accepted and will appear in the Proceedings of SIGdial 2023"},{"id":"http://arxiv.org/abs/2308.15226v1","updated":"2023-08-29T11:29:43Z","published":"2023-08-29T11:29:43Z","title":"CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for\n Multimodal Machine Translation","summary":" There has been a growing interest in developing multimodal machine\ntranslation (MMT) systems that enhance neural machine translation (NMT) with\nvisual knowledge. This problem setup involves using images as auxiliary\ninformation during training, and more recently, eliminating their use during\ninference. Towards this end, previous works face a challenge in training\npowerful MMT models from scratch due to the scarcity of annotated multilingual\nvision-language data, especially for low-resource languages. Simultaneously,\nthere has been an influx of multilingual pre-trained models for NMT and\nmultimodal pre-trained models for vision-language tasks, primarily in English,\nwhich have shown exceptional generalisation ability. However, these are not\ndirectly applicable to MMT since they do not provide aligned multimodal\nmultilingual features for generative tasks. 
To alleviate this issue, instead of\ndesigning complex modules for MMT, we propose CLIPTrans, which simply adapts\nthe independently pre-trained multimodal M-CLIP and the multilingual mBART. In\norder to align their embedding spaces, mBART is conditioned on the M-CLIP\nfeatures by a prefix sequence generated through a lightweight mapping network.\nWe train this in a two-stage pipeline which warms up the model with image\ncaptioning before the actual translation task. Through experiments, we\ndemonstrate the merits of this framework and consequently push forward the\nstate-of-the-art across standard benchmarks by an average of +2.67 BLEU. The\ncode can be found at www.github.com/devaansh100/CLIPTrans.\n","authors":["Devaansh Gupta","Siddhant Kharbanda","Jiawei Zhou","Wanhua Li","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2308.15226v1.pdf","comment":"15 pages, 9 figures, to be published In Proceedings of International\n Conference of Computer Vision(ICCV), 2023"},{"id":"http://arxiv.org/abs/2308.15214v1","updated":"2023-08-29T11:08:40Z","published":"2023-08-29T11:08:40Z","title":"FurChat: An Embodied Conversational Agent using LLMs, Combining Open and\n Closed-Domain Dialogue with Facial Expressions","summary":" We demonstrate an embodied conversational agent that can function as a\nreceptionist and generate a mixture of open and closed-domain dialogue along\nwith facial expressions, by using a large language model (LLM) to develop an\nengaging conversation. We deployed the system onto a Furhat robot, which is\nhighly expressive and capable of using both verbal and nonverbal cues during\ninteraction. The system was designed specifically for the National Robotarium\nto interact with visitors through natural conversations, providing them with\ninformation about the facilities, research, news, upcoming events, etc. The\nsystem utilises the state-of-the-art GPT-3.5 model to generate such information\nalong with domain-general conversations and facial expressions based on prompt\nengineering.\n","authors":["Neeraj Cherakara","Finny Varghese","Sheena Shabana","Nivan Nelson","Abhiram Karukayil","Rohith Kulothungan","Mohammed Afil Farhan","Birthe Nesset","Meriam Moujahid","Tanvi Dinkar","Verena Rieser","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2308.15214v1.pdf","comment":"5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the\n Special Interest Group on Discourse and Dialogue), for the demo video, see\n https://youtu.be/fwtUl1kl22s"},{"id":"http://arxiv.org/abs/2308.15209v1","updated":"2023-08-29T10:55:44Z","published":"2023-08-29T10:55:44Z","title":"Shared Lexical Items as Triggers of Code Switching","summary":" Why do bilingual speakers code-switch (mix their two languages)? Among the\nseveral theories that attempt to explain this natural and ubiquitous\nphenomenon, the Triggering Hypothesis relates code-switching to the presence of\nlexical triggers, specifically cognates and proper names, adjacent to the\nswitch point. We provide a fuller, more nuanced and refined exploration of the\ntriggering hypothesis, based on five large datasets in three language pairs,\nreflecting both spoken and written bilingual interactions. 
Our results show\nthat words that are assumed to reside in a mental lexicon shared by both\nlanguages indeed trigger code-switching; that the tendency to switch depends on\nthe distance of the trigger from the switch point; and on whether the trigger\nprecedes or succeeds the switch; but not on the etymology of the trigger words.\nWe thus provide strong, robust, evidence-based confirmation of several\nhypotheses on the relationships between lexical triggers and code-switching.\n","authors":["Shuly Wintner","Safaa Shehadi","Yuli Zeira","Doreen Osmelak","Yuval Nov"],"pdf_url":"https://arxiv.org/pdf/2308.15209v1.pdf","comment":"This is the author's final version; the article has been accepted for\n publication in the Transactions of the Association for Computational\n Linguistics (TACL)"},{"id":"http://arxiv.org/abs/2308.15202v1","updated":"2023-08-29T10:40:46Z","published":"2023-08-29T10:40:46Z","title":"Benchmarking the Generation of Fact Checking Explanations","summary":" Fighting misinformation is a challenging, yet crucial, task. Despite the\ngrowing number of experts being involved in manual fact-checking, this activity\nis time-consuming and cannot keep up with the ever-increasing amount of Fake\nNews produced daily. Hence, automating this process is necessary to help curb\nmisinformation. Thus far, researchers have mainly focused on claim veracity\nclassification. In this paper, instead, we address the generation of\njustifications (textual explanation of why a claim is classified as either true\nor false) and benchmark it with novel datasets and advanced baselines. In\nparticular, we focus on summarization approaches over unstructured knowledge\n(i.e. news articles) and we experiment with several extractive and abstractive\nstrategies. We employed two datasets with different styles and structures, in\norder to assess the generalizability of our findings. Results show that in\njustification production, summarization benefits from the claim information,\nand, in particular, that a claim-driven extractive step improves abstractive\nsummarization performances. Finally, we show that although cross-dataset\nexperiments suffer from performance degradation, a unique model trained on a\ncombination of the two datasets is able to retain style information in an\nefficient manner.\n","authors":["Daniel Russo","Serra Sinem Tekiroglu","Marco Guerini"],"pdf_url":"https://arxiv.org/pdf/2308.15202v1.pdf","comment":"Accepted to TACL. This arXiv version is a pre-MIT Press publication\n version"},{"id":"http://arxiv.org/abs/2308.15192v1","updated":"2023-08-29T10:20:53Z","published":"2023-08-29T10:20:53Z","title":"Enhancing Psychological Counseling with Large Language Model: A\n Multifaceted Decision-Support System for Non-Professionals","summary":" In the contemporary landscape of social media, an alarming number of users\nexpress negative emotions, some of which manifest as strong suicidal\nintentions. This situation underscores a profound need for trained\npsychological counselors who can enact effective mental interventions. However,\nthe development of these professionals is often an imperative but\ntime-consuming task. Consequently, the mobilization of non-professionals or\nvolunteers in this capacity emerges as a pressing concern. Leveraging the\ncapabilities of artificial intelligence, and in particular, the recent advances\nin large language models, offers a viable solution to this challenge. 
This\npaper introduces a novel model constructed on the foundation of large language\nmodels to fully assist non-professionals in providing psychological\ninterventions on online user discourses. This framework makes it plausible to\nharness the power of non-professional counselors in a meaningful way. A\ncomprehensive study was conducted involving ten professional psychological\ncounselors of varying expertise, evaluating the system across five critical\ndimensions. The findings affirm that our system is capable of analyzing\npatients' issues with relative accuracy and proffering professional-level\nstrategy recommendations, thereby enhancing support for non-professionals.\nThis research serves as a compelling validation of the application of large\nlanguage models in the field of psychology and lays the groundwork for a new\nparadigm of community-based mental health support.\n","authors":["Guanghui Fu","Qing Zhao","Jianqiang Li","Dan Luo","Changwei Song","Wei Zhai","Shuo Liu","Fan Wang","Yan Wang","Lijuan Cheng","Juan Zhang","Bing Xiang Yang"],"pdf_url":"https://arxiv.org/pdf/2308.15192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15154v1","updated":"2023-08-29T09:35:23Z","published":"2023-08-29T09:35:23Z","title":"The Anatomy of Conspirators: Unveiling Traits using a Comprehensive\n Twitter Dataset","summary":" The discourse around conspiracy theories is currently thriving amidst the\nrampant misinformation prevalent in online environments. Research in this field\nhas been focused on detecting conspiracy theories on social media, often\nrelying on limited datasets. In this study, we present a novel methodology for\nconstructing a Twitter dataset that encompasses accounts engaged in\nconspiracy-related activities throughout the year 2022. Our approach centers on\ndata collection that is independent of specific conspiracy theories and\ninformation operations. Additionally, our dataset includes a control group\ncomprising randomly selected users who can be fairly compared to the\nindividuals involved in conspiracy activities. This comprehensive collection\neffort yielded a total of 15K accounts and 37M tweets extracted from their\ntimelines. We conduct a comparative analysis of the two groups across three\ndimensions: topics, profiles, and behavioral characteristics. The results\nindicate that conspiracy and control users exhibit similarity in terms of their\nprofile metadata characteristics. However, they diverge significantly in terms\nof behavior and activity, particularly regarding the discussed topics, the\nterminology used, and their stance on trending subjects. Interestingly, there\nis no significant disparity in the presence of bot users between the two\ngroups, suggesting that conspiracy and automation are orthogonal concepts.\nFinally, we develop a classifier to identify conspiracy users using 93\nfeatures, some of which are commonly employed in the literature for troll\nidentification. 
The results demonstrate a high accuracy level (with an average\nF1 score of 0.98), enabling us to uncover the most discriminative features\nassociated with conspiracy-related accounts.\n","authors":["Margherita Gambini","Serena Tardelli","Maurizio Tesconi"],"pdf_url":"https://arxiv.org/pdf/2308.15154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15126v1","updated":"2023-08-29T08:51:24Z","published":"2023-08-29T08:51:24Z","title":"Evaluation and Analysis of Hallucination in Large Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) have recently achieved remarkable\nsuccess. However, LVLMs are still plagued by the hallucination problem, which\nlimits their practicality in many scenarios. Hallucination refers to\ninformation in LVLMs' responses that does not exist in the visual input, which\nposes potential risks of substantial consequences. There has been limited work\nstudying hallucination evaluation in LVLMs. In this paper, we propose\nHallucination Evaluation based on Large Language Models (HaELM), an LLM-based\nhallucination evaluation framework. HaELM achieves approximately 95% of\nChatGPT's performance and has additional advantages including low\ncost, reproducibility, privacy preservation and local deployment. Leveraging\nHaELM, we evaluate the hallucination in current LVLMs. Furthermore, we\nanalyze the factors contributing to hallucination in LVLMs and offer helpful\nsuggestions to mitigate the hallucination problem. Our training data and human\nannotation hallucination data will be made public soon.\n","authors":["Junyang Wang","Yiyang Zhou","Guohai Xu","Pengcheng Shi","Chenlin Zhao","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Jihua Zhu","Jitao Sang","Haoyu Tang"],"pdf_url":"https://arxiv.org/pdf/2308.15126v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.15122v1","updated":"2023-08-29T08:41:16Z","published":"2023-08-29T08:41:16Z","title":"SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge\n Distillation from BERT","summary":" Spiking neural networks (SNNs) offer a promising avenue to implement deep\nneural networks in a more energy-efficient way. 
However, the network\narchitectures of existing SNNs for language tasks are too simplistic, and deep\narchitectures have not been fully explored, resulting in a significant\nperformance gap compared to mainstream transformer-based networks such as BERT.\nTo this end, we improve a recently-proposed spiking transformer (i.e.,\nSpikformer) to make it possible to process language tasks and propose a\ntwo-stage knowledge distillation method for training it, which combines\npre-training by distilling knowledge from BERT with a large collection of\nunlabelled texts and fine-tuning with task-specific instances via knowledge\ndistillation again from the BERT fine-tuned on the same training examples.\nThrough extensive experimentation, we show that the models trained with our\nmethod, named SpikeBERT, outperform state-of-the-art SNNs and even achieve\ncomparable results to BERTs on text classification tasks for both English and\nChinese with much less energy consumption.\n","authors":["Changze Lv","Tianlong Li","Jianhan Xu","Chenxi Gu","Zixuan Ling","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15118v1","updated":"2023-08-29T08:36:30Z","published":"2023-08-29T08:36:30Z","title":"Large Language Models on the Chessboard: A Study on ChatGPT's Formal\n Language Comprehension and Complex Reasoning Skills","summary":" While large language models have made strides in natural language processing,\ntheir proficiency in complex reasoning tasks requiring formal language\ncomprehension, such as chess, remains less investigated. This paper probes the\nperformance of ChatGPT, a sophisticated language model by OpenAI in tackling\nsuch complex reasoning tasks, using chess as a case study. Through robust\nmetrics examining both the legality and quality of moves, we assess ChatGPT's\nunderstanding of the chessboard, adherence to chess rules, and strategic\ndecision-making abilities. Our evaluation identifies limitations within\nChatGPT's attention mechanism that affect its formal language comprehension and\nuncovers the model's underdeveloped self-regulation abilities. Our study also\nreveals ChatGPT's propensity for a coherent strategy in its gameplay and a\nnoticeable uptick in decision-making assertiveness when the model is presented\nwith a greater volume of natural language or possesses a more lucid\nunderstanding of the state of the chessboard. These findings contribute to the\ngrowing exploration of language models' abilities beyond natural language\nprocessing, providing valuable information for future research towards models\ndemonstrating human-like cognitive abilities.\n","authors":["Mu-Tien Kuo","Chih-Chung Hsueh","Richard Tzong-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2308.15118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15097v1","updated":"2023-08-29T08:07:26Z","published":"2023-08-29T08:07:26Z","title":"Sequential annotations for naturally-occurring HRI: first insights","summary":" We explain the methodology we developed for improving the interactions\naccomplished by an embedded conversational agent, drawing from Conversation\nAnalytic sequential and multimodal analysis. The use case is a Pepper robot\nthat is expected to inform and orient users in a library. 
In order to propose\nand learn better interactive schema, we are creating a corpus of\nnaturally-occurring interactions that will be made available to the community.\nTo do so, we propose an annotation practice based on some theoretical\nunderpinnings about the use of language and multimodal resources in human-robot\ninteraction.\n","authors":["Lucien Tisserand","Frédéric Armetta","Heike Baldauf-Quilliatre","Antoine Bouquin","Salima Hassas","Mathieu Lefort"],"pdf_url":"https://arxiv.org/pdf/2308.15097v1.pdf","comment":"Peer-reviewed workshop paper accepted for the ''Human-Robot\n Conversational Interaction'' workshop that took place at the ''ACM/IEEE\n International Conference on Human-Robot Interaction'' 2023 Conference in\n Stockholm, Sweden"},{"id":"http://arxiv.org/abs/2308.15090v1","updated":"2023-08-29T07:53:17Z","published":"2023-08-29T07:53:17Z","title":"Killing two birds with one stone: Can an audio captioning system also be\n used for audio-text retrieval?","summary":" Automated Audio Captioning (AAC) aims to develop systems capable of\ndescribing an audio recording using a textual sentence. In contrast, Audio-Text\nRetrieval (ATR) systems seek to find the best matching audio recording(s) for a\ngiven textual query (Text-to-Audio) or vice versa (Audio-to-Text). These tasks\nrequire different types of systems: AAC employs a sequence-to-sequence model,\nwhile ATR utilizes a ranking model that compares audio and text representations\nwithin a shared projection subspace. However, this work investigates the\nrelationship between AAC and ATR by exploring the ATR capabilities of an\nunmodified AAC system, without fine-tuning for the new task. Our AAC system\nconsists of an audio encoder (ConvNeXt-Tiny) trained on AudioSet for audio\ntagging, and a transformer decoder responsible for generating sentences. For\nAAC, it achieves a high SPIDEr-FL score of 0.298 on Clotho and 0.472 on\nAudioCaps on average. For ATR, we propose using the standard Cross-Entropy loss\nvalues obtained for any audio/caption pair. Experimental results on the Clotho\nand AudioCaps datasets demonstrate decent recall values using this simple\napproach. For instance, we obtained a Text-to-Audio R@1 value of 0.382 for\nAudioCaps, which is above the current state-of-the-art method without external\ndata. Interestingly, we observe that normalizing the loss values was necessary\nfor Audio-to-Text retrieval.\n","authors":["Etienne Labbé","Thomas Pellegrini","Julien Pinquier"],"pdf_url":"https://arxiv.org/pdf/2308.15090v1.pdf","comment":"cam ready version (14/08/23)"},{"id":"http://arxiv.org/abs/2305.10666v2","updated":"2023-08-29T07:16:52Z","published":"2023-05-18T02:57:54Z","title":"a unified front-end framework for english text-to-speech synthesis","summary":" The front-end is a critical component of English text-to-speech (TTS)\nsystems, responsible for extracting linguistic features that are essential for\na text-to-speech model to synthesize speech, such as prosodies and phonemes.\nThe English TTS front-end typically consists of a text normalization (TN)\nmodule, a prosody word prosody phrase (PWPP) module, and a grapheme-to-phoneme\n(G2P) module. 
However, current research on the English TTS front-end focuses\nsolely on individual modules, neglecting the interdependence between them and\nresulting in sub-optimal performance for each module. Therefore, this paper\nproposes a unified front-end framework that captures the dependencies among the\nEnglish TTS front-end modules. Extensive experiments have demonstrated that the\nproposed method achieves state-of-the-art (SOTA) performance in all modules.\n","authors":["Zelin Ying","Chen Li","Yu Dong","Qiuqiang Kong","Qiao Tian","Yuanyuan Huo","Yuxuan Wang"],"pdf_url":"https://arxiv.org/pdf/2305.10666v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.15055v1","updated":"2023-08-29T06:31:21Z","published":"2023-08-29T06:31:21Z","title":"Taxonomic Loss for Morphological Glossing of Low-Resource Languages","summary":" Morpheme glossing is a critical task in automated language documentation and\ncan benefit other downstream applications greatly. While state-of-the-art\nglossing systems perform very well for languages with large amounts of existing\ndata, it is more difficult to create useful models for low-resource languages.\nIn this paper, we propose the use of a taxonomic loss function that exploits\nmorphological information to make morphological glossing more performant when\ndata is scarce. We find that while the use of this loss function does not\noutperform a standard loss function with regards to single-label prediction\naccuracy, it produces better predictions when considering the top-n predicted\nlabels. We suggest this property makes the taxonomic loss function useful in a\nhuman-in-the-loop annotation setting.\n","authors":["Michael Ginn","Alexis Palmer"],"pdf_url":"https://arxiv.org/pdf/2308.15055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15053v1","updated":"2023-08-29T06:27:58Z","published":"2023-08-29T06:27:58Z","title":"Adapting text-based dialogue state tracker for spoken dialogues","summary":" Although there have been remarkable advances in dialogue systems through the\ndialogue systems technology competition (DSTC), it remains one of the key\nchallenges to building a robust task-oriented dialogue system with a speech\ninterface. Most of the progress has been made for text-based dialogue systems\nsince there are abundant datasets with written corpora while those with spoken\ndialogues are very scarce. However, as can be seen from voice assistant systems\nsuch as Siri and Alexa, it is of practical importance to transfer the success\nto spoken dialogues. In this paper, we describe our engineering effort in\nbuilding a highly successful model that participated in the speech-aware\ndialogue systems technology challenge track in DSTC11. Our model consists of\nthree major modules: (1) automatic speech recognition error correction to\nbridge the gap between the spoken and the text utterances, (2) text-based\ndialogue system (D3ST) for estimating the slots and values using slot\ndescriptions, and (3) post-processing for recovering the error of the estimated\nslot value. 
Our experiments show that it is important to use an explicit\nautomatic speech recognition error correction module, post-processing, and data\naugmentation to adapt a text-based dialogue state tracker for spoken dialogue\ncorpora.\n","authors":["Jaeseok Yoon","Seunghyun Hwang","Ran Han","Jeonguk Bang","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15053v1.pdf","comment":"8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at\n SIGDIAL 2023"},{"id":"http://arxiv.org/abs/2308.15047v1","updated":"2023-08-29T06:09:47Z","published":"2023-08-29T06:09:47Z","title":"Large language models converge toward human-like concept organization","summary":" Large language models show human-like performance in knowledge extraction,\nreasoning and dialogue, but it remains controversial whether this performance\nis best explained by memorization and pattern matching, or whether it reflects\nhuman-like inferential semantics and world knowledge. Knowledge bases such as\nWikiData provide large-scale, high-quality representations of inferential\nsemantics and world knowledge. We show that large language models learn to\norganize concepts in ways that are strikingly similar to how concepts are\norganized in such knowledge bases. Knowledge bases model collective,\ninstitutional knowledge, and large language models seem to induce such\nknowledge from raw text. We show that bigger and better models exhibit more\nhuman-like concept organization, across four families of language models and\nthree knowledge graph embeddings.\n","authors":["Mathias Lykke Gammelgaard","Jonathan Gabel Christiansen","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2308.15047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12095v5","updated":"2023-08-29T05:34:25Z","published":"2023-02-22T11:01:20Z","title":"On the Robustness of ChatGPT: An Adversarial and Out-of-distribution\n Perspective","summary":" ChatGPT is a recent chatbot service released by OpenAI and is receiving\nincreasing attention over the past few months. While evaluations of various\naspects of ChatGPT have been done, its robustness, i.e., the performance to\nunexpected inputs, is still unclear to the public. Robustness is of particular\nconcern in responsible AI, especially for safety-critical applications. In this\npaper, we conduct a thorough evaluation of the robustness of ChatGPT from the\nadversarial and out-of-distribution (OOD) perspective. To do so, we employ the\nAdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart\nreview and DDXPlus medical diagnosis datasets for OOD evaluation. We select\nseveral popular foundation models as baselines. Results show that ChatGPT shows\nconsistent advantages on most adversarial and OOD classification and\ntranslation tasks. However, the absolute performance is far from perfection,\nwhich suggests that adversarial and OOD robustness remains a significant threat\nto foundation models. Moreover, ChatGPT shows astounding performance in\nunderstanding dialogue-related texts and we find that it tends to provide\ninformal suggestions for medical tasks instead of definitive answers. 
Finally,\nwe present in-depth discussions of possible research directions.\n","authors":["Jindong Wang","Xixu Hu","Wenxin Hou","Hao Chen","Runkai Zheng","Yidong Wang","Linyi Yang","Haojun Huang","Wei Ye","Xiubo Geng","Binxin Jiao","Yue Zhang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2302.12095v5.pdf","comment":"Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable\n Large-Scale Machine Learning Models; code is at:\n https://github.com/microsoft/robustlearn; more works:\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2308.15027v1","updated":"2023-08-29T05:18:47Z","published":"2023-08-29T05:18:47Z","title":"Improving Neural Ranking Models with Traditional IR Methods","summary":" Neural ranking methods based on large transformer models have recently gained\nsignificant attention in the information retrieval community, and have been\nadopted by major commercial solutions. Nevertheless, they are computationally\nexpensive to create, and require a great deal of labeled data for specialized\ncorpora. In this paper, we explore a low resource alternative which is a\nbag-of-embedding model for document retrieval and find that it is competitive\nwith large transformer models fine tuned on information retrieval tasks. Our\nresults show that a simple combination of TF-IDF, a traditional keyword\nmatching method, with a shallow embedding model provides a low cost path to\ncompete well with the performance of complex neural ranking models on 3\ndatasets. Furthermore, adding TF-IDF measures improves the performance of\nlarge-scale fine tuned models on these tasks.\n","authors":["Anik Saha","Oktie Hassanzadeh","Alex Gittens","Jian Ni","Kavitha Srinivas","Bulent Yener"],"pdf_url":"https://arxiv.org/pdf/2308.15027v1.pdf","comment":"Short paper, 4 pages"},{"id":"http://arxiv.org/abs/2308.15022v1","updated":"2023-08-29T04:59:53Z","published":"2023-08-29T04:59:53Z","title":"Recursively Summarizing Enables Long-Term Dialogue Memory in Large\n Language Models","summary":" Most open-domain dialogue systems suffer from forgetting important\ninformation, especially in a long-term conversation. Existing works usually\ntrain the specific retriever or summarizer to obtain key information from the\npast, which is time-consuming and highly depends on the quality of labeled\ndata. To alleviate this problem, we propose to recursively generate summaries/\nmemory using large language models (LLMs) to enhance long-term memory ability.\nSpecifically, our method first stimulates LLMs to memorize small dialogue\ncontexts and then recursively produce new memory using previous memory and\nfollowing contexts. Finally, the LLM can easily generate a highly consistent\nresponse with the help of the latest memory. We evaluate our method using\nChatGPT and text-davinci-003, and the experiments on the widely-used public\ndataset show that our method can generate more consistent responses in a\nlong-context conversation. Notably, our method is a potential solution to\nenable the LLM to model the extremely long context. 
Code and scripts will be\nreleased later.\n","authors":["Qingyue Wang","Liang Ding","Yanan Cao","Zhiliang Tian","Shi Wang","Dacheng Tao","Li Guo"],"pdf_url":"https://arxiv.org/pdf/2308.15022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15010v1","updated":"2023-08-29T04:16:57Z","published":"2023-08-29T04:16:57Z","title":"TransPrompt v2: A Transferable Prompting Framework for Cross-task Text\n Classification","summary":" Text classification is one of the most imperative tasks in natural language\nprocessing (NLP). Recent advances with pre-trained language models (PLMs) have\nshown remarkable success on this task. However, the satisfying results obtained\nby PLMs heavily depend on the large amounts of task-specific labeled data,\nwhich may not be feasible in many application scenarios due to data access and\nprivacy constraints. The recently-proposed prompt-based fine-tuning paradigm\nimproves the performance of PLMs for few-shot text classification with\ntask-specific templates. Yet, it is unclear how the prompting knowledge can be\ntransferred across tasks, for the purpose of mutual reinforcement. We propose\nTransPrompt v2, a novel transferable prompting framework for few-shot learning\nacross similar or distant text classification tasks. For learning across\nsimilar tasks, we employ a multi-task meta-knowledge acquisition (MMA)\nprocedure to train a meta-learner that captures the cross-task transferable\nknowledge. For learning across distant tasks, we further inject the task type\ndescriptions into the prompt, and capture the intra-type and inter-type prompt\nembeddings among multiple distant tasks. Additionally, two de-biasing\ntechniques are further designed to make the trained meta-learner more\ntask-agnostic and unbiased towards any tasks. After that, the meta-learner can\nbe adapted to each specific task with better parameters initialization.\nExtensive experiments show that TransPrompt v2 outperforms single-task and\ncross-task strong baselines over multiple NLP tasks and datasets. We further\nshow that the meta-learner can effectively improve the performance of PLMs on\npreviously unseen tasks. In addition, TransPrompt v2 also outperforms strong\nfine-tuning baselines when learning with full training sets.\n","authors":["Jianing Wang","Chengyu Wang","Cen Chen","Ming Gao","Jun Huang","Aoying Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09539v2","updated":"2023-08-29T01:08:30Z","published":"2023-06-15T22:48:08Z","title":"Block-State Transformer","summary":" State space models (SSMs) have shown impressive results on tasks that require\nmodeling long-range dependencies and efficiently scale to long sequences owing\nto their subquadratic runtime complexity. Originally designed for continuous\nsignals, SSMs have shown superior performance on a plethora of tasks, in vision\nand audio; however, SSMs still lag Transformer performance in Language Modeling\ntasks. In this work, we propose a hybrid layer named Block-State Transformer\n(BST), that internally combines an SSM sublayer for long-range\ncontextualization, and a Block Transformer sublayer for short-term\nrepresentation of sequences. We study three different, and completely\nparallelizable, variants that integrate SSMs and block-wise attention. We show\nthat our model outperforms similar Transformer-based architectures on language\nmodeling perplexity and generalizes to longer sequences. 
In addition, the\nBlock-State Transformer demonstrates a more than tenfold increase in speed at the\nlayer level compared to the Block-Recurrent Transformer when model\nparallelization is employed.\n","authors":["Mahan Fathi","Jonathan Pilault","Pierre-Luc Bacon","Christopher Pal","Orhan Firat","Ross Goroshin"],"pdf_url":"https://arxiv.org/pdf/2306.09539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07224v3","updated":"2023-08-29T00:56:16Z","published":"2023-05-12T03:31:24Z","title":"Asymmetric feature interaction for interpreting model predictions","summary":" In natural language processing (NLP), deep neural networks (DNNs) can model\ncomplex interactions in context and have achieved impressive results on a\nrange of NLP tasks. Prior works on feature interaction attribution mainly focus\non studying symmetric interaction that only explains the additional influence\nof a set of words in combination, which fails to capture asymmetric influence\nthat contributes to model prediction. In this work, we propose an asymmetric\nfeature interaction attribution explanation model that aims to explore\nasymmetric higher-order feature interactions in the inference of deep neural\nNLP models. By representing our explanation with a directed interaction graph,\nwe experimentally demonstrate the interpretability of the graph to discover\nasymmetric feature interactions. Experimental results on two sentiment\nclassification datasets show the superiority of our model against the\nstate-of-the-art feature interaction attribution methods in identifying\ninfluential features for model predictions. Our code is available at\nhttps://github.com/StillLu/ASIV.\n","authors":["Xiaolei Lu","Jianghong Ma","Haode Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.07224v3.pdf","comment":"Accepted by Findings of the Association for Computational\n Linguistics: ACL 2023 (long paper)"},{"id":"http://arxiv.org/abs/2308.14951v1","updated":"2023-08-29T00:44:27Z","published":"2023-08-29T00:44:27Z","title":"Robust Open-Set Spoken Language Identification and the CU MultiLang\n Dataset","summary":" Most state-of-the-art spoken language identification models are closed-set;\nin other words, they can only output a language label from the set of classes\nthey were trained on. Open-set spoken language identification systems, however,\ngain the ability to detect when an input exhibits none of the original\nlanguages. In this paper, we implement a novel approach to open-set spoken\nlanguage identification that uses MFCC and pitch features, a TDNN model to\nextract meaningful feature embeddings, confidence thresholding on softmax\noutputs, and LDA and pLDA for learning to classify new unknown languages. We\npresent a spoken language identification system that achieves 91.76% accuracy\non trained languages and has the capability to adapt to unknown languages on\nthe fly. 
To that end, we also built the CU MultiLang Dataset, a large and\ndiverse multilingual speech corpus which was used to train and evaluate our\nsystem.\n","authors":["Mustafa Eyceoz","Justin Lee","Siddharth Pittie","Homayoon Beigi"],"pdf_url":"https://arxiv.org/pdf/2308.14951v1.pdf","comment":"6pages, 1 table, 6 figures"},{"id":"http://arxiv.org/abs/2307.08303v3","updated":"2023-08-29T21:52:58Z","published":"2023-07-17T07:55:47Z","title":"Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language\n Models","summary":" Dense retrieval (DR) converts queries and documents into dense embeddings and\nmeasures the similarity between queries and documents in vector space. One of\nthe challenges in DR is the lack of domain-specific training data. While DR\nmodels can learn from large-scale public datasets like MS MARCO through\ntransfer learning, evidence shows that not all DR models and domains can\nbenefit from transfer learning equally. Recently, some researchers have\nresorted to large language models (LLMs) to improve the zero-shot and few-shot\nDR models. However, the hard prompts or human-written prompts utilized in these\nworks cannot guarantee the good quality of generated weak queries. To tackle\nthis, we propose soft prompt tuning for augmenting DR (SPTAR): For each task,\nwe leverage soft prompt-tuning to optimize a task-specific soft prompt on\nlimited ground truth data and then prompt the LLMs to tag unlabeled documents\nwith weak queries, yielding enough weak document-query pairs to train\ntask-specific dense retrievers. We design a filter to select high-quality\nexample document-query pairs in the prompt to further improve the quality of\nweak tagged queries. To the best of our knowledge, there is no prior work\nutilizing soft prompt tuning to augment DR models. The experiments demonstrate\nthat SPTAR outperforms the unsupervised baselines BM25 and the recently\nproposed LLMs-based augmentation method for DR.\n","authors":["Zhiyuan Peng","Xuyang Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08303v3.pdf","comment":"fix typos"},{"id":"http://arxiv.org/abs/2308.14306v2","updated":"2023-08-29T20:10:50Z","published":"2023-08-28T04:57:07Z","title":"Evaluating the Robustness to Instructions of Large Language Models","summary":" Recently, Instruction fine-tuning has risen to prominence as a potential\nmethod for enhancing the zero-shot capabilities of Large Language Models (LLMs)\non novel tasks. This technique has shown an exceptional ability to boost the\nperformance of moderately sized LLMs, sometimes even reaching performance\nlevels comparable to those of much larger model variants. The focus is on the\nrobustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an\nexploration of six models including Alpaca, Vicuna, WizardLM, and Traditional\nTask-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction\ndatasets as case studies. We carried out a comprehensive evaluation of these\ninstruction-following LLMs which have been tuned based on open-domain\ninstructions and task-oriented instructions. The main discussion is their\nperformance and robustness towards instructions. We have observed that in most\ncases, the model's performance in dealing with unfamiliar instructions tends to\nworsen significantly, and the robustness of the model for RE instructions\ndeteriorates compared to QA. 
Further, we discovered that up until a certain\nparameter size threshold (3B), the performance of the FLAN-T5 model improves as\nthe parameter count increases. The robustness of different scales of FLAN-T5\nmodels to RE instruction is worse than the robustness to QA instruction.\n","authors":["Yuansheng Ni","Sichao Jiang","Xinyu wu","Hui Shen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14306v2.pdf","comment":"In our study, erroneous data analysis inadvertently led to misleading\n outcomes. Incorrect variables were included, distorting results. This\n emphasizes the significance of robust data processing and analysis techniques\n in research"},{"id":"http://arxiv.org/abs/2212.10003v2","updated":"2023-08-29T19:36:32Z","published":"2022-12-20T05:25:12Z","title":"(QA)$^2$: Question Answering with Questionable Assumptions","summary":" Naturally occurring information-seeking questions often contain questionable\nassumptions -- assumptions that are false or unverifiable. Questions containing\nquestionable assumptions are challenging because they require a distinct answer\nstrategy that deviates from typical answers for information-seeking questions.\nFor instance, the question \"When did Marie Curie discover Uranium?\" cannot be\nanswered as a typical \"when\" question without addressing the false assumption\n\"Marie Curie discovered Uranium\". In this work, we propose (QA)$^2$ (Question\nAnswering with Questionable Assumptions), an open-domain evaluation dataset\nconsisting of naturally occurring search engine queries that may or may not\ncontain questionable assumptions. To be successful on (QA)$^2$, systems must be\nable to detect questionable assumptions and also be able to produce adequate\nresponses for both typical information-seeking questions and ones with\nquestionable assumptions. Through human rater acceptability on end-to-end QA\nwith (QA)$^2$, we find that current models do struggle with handling\nquestionable assumptions, leaving substantial headroom for progress.\n","authors":["Najoung Kim","Phu Mon Htut","Samuel R. Bowman","Jackson Petty"],"pdf_url":"https://arxiv.org/pdf/2212.10003v2.pdf","comment":"ACL 2023 camera-ready"},{"id":"http://arxiv.org/abs/2308.13399v2","updated":"2023-08-29T18:28:13Z","published":"2023-08-25T14:23:40Z","title":"EntropyRank: Unsupervised Keyphrase Extraction via Side-Information\n Optimization for Language Model-based Text Compression","summary":" We propose an unsupervised method to extract keywords and keyphrases from\ntexts based on a pre-trained language model (LM) and Shannon's information\nmaximization. Specifically, our method extracts phrases having the highest\nconditional entropy under the LM. The resulting set of keyphrases turns out to\nsolve a relevant information-theoretic problem: if provided as side\ninformation, it leads to the expected minimal binary code length in compressing\nthe text using the LM and an entropy encoder. Alternately, the resulting set is\nan approximation via a causal LM to the set of phrases that minimize the\nentropy of the text when conditioned upon it. 
Empirically, the method provides\nresults comparable to the most commonly used methods in various keyphrase\nextraction benchmark challenges.\n","authors":["Alexander Tsvetkov","Alon Kipnis"],"pdf_url":"https://arxiv.org/pdf/2308.13399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15517v1","updated":"2023-08-29T16:58:03Z","published":"2023-08-29T16:58:03Z","title":"Document AI: A Comparative Study of Transformer-Based, Graph-Based\n Models, and Convolutional Neural Networks For Document Layout Analysis","summary":" Document AI aims to automatically analyze documents by leveraging natural\nlanguage processing and computer vision techniques. One of the major tasks of\nDocument AI is document layout analysis, which structures document pages by\ninterpreting the content and spatial relationships of layout, image, and text.\nThis task can be image-centric, wherein the aim is to identify and label\nvarious regions such as authors and paragraphs, or text-centric, where the\nfocus is on classifying individual words in a document. Although there are\nincreasingly sophisticated methods for improving layout analysis, doubts remain\nabout the extent to which their findings can be generalized to a broader\ncontext. Specifically, prior work developed systems based on very different\narchitectures, such as transformer-based, graph-based, and CNNs. However, no\nwork has examined the effectiveness of these models in a comparative analysis.\nMoreover, while language-independent Document AI models capable of knowledge\ntransfer have been developed, it remains to be investigated to what degree they\ncan effectively transfer knowledge. In this study, we aim to fill these gaps by\nconducting a comparative evaluation of state-of-the-art models in document\nlayout analysis and investigating the potential of cross-lingual layout\nanalysis by utilizing machine translation techniques.\n","authors":["Sotirios Kastanas","Shaomu Tan","Yi He"],"pdf_url":"https://arxiv.org/pdf/2308.15517v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.15479v1","updated":"2023-08-29T17:58:55Z","published":"2023-08-29T17:58:55Z","title":"3D Adversarial Augmentations for Robust Out-of-Domain Predictions","summary":" Since real-world training datasets cannot properly sample the long tail of\nthe underlying data distribution, corner cases and rare out-of-domain samples\ncan severely hinder the performance of state-of-the-art models. This problem\nbecomes even more severe for dense tasks, such as 3D semantic segmentation,\nwhere points of non-standard objects can be confidently associated with the wrong\nclass. In this work, we focus on improving the generalization to out-of-domain\ndata. We achieve this by augmenting the training set with adversarial examples.\nFirst, we learn a set of vectors that deform the objects in an adversarial\nfashion. To prevent the adversarial examples from being too far from the\nexisting data distribution, we preserve their plausibility through a series of\nconstraints, ensuring sensor-awareness and shape smoothness. Then, we perform\nadversarial augmentation by applying the learned sample-independent vectors to\nthe available objects when training a model. We conduct extensive experiments\nacross a variety of scenarios on data from KITTI, Waymo, and CrashD for 3D\nobject detection, and on data from SemanticKITTI, Waymo, and nuScenes for 3D\nsemantic segmentation. 
Despite training on a standard single dataset, our\napproach substantially improves the robustness and generalization of both 3D\nobject detection and 3D semantic segmentation methods to out-of-domain data.\n","authors":["Alexander Lehner","Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.15479v1.pdf","comment":"37 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.15478v1","updated":"2023-08-29T17:57:20Z","published":"2023-08-29T17:57:20Z","title":"An Adaptive Tangent Feature Perspective of Neural Networks","summary":" In order to better understand feature learning in neural networks, we propose\na framework for understanding linear models in tangent feature space where the\nfeatures are allowed to be transformed during training. We consider linear\ntransformations of features, resulting in a joint optimization over parameters\nand transformations with a bilinear interpolation constraint. We show that this\noptimization problem has an equivalent linearly constrained optimization with\nstructured regularization that encourages approximately low rank solutions.\nSpecializing to neural network structure, we gain insights into how the\nfeatures and thus the kernel function change, providing additional nuance to\nthe phenomenon of kernel alignment when the target function is poorly\nrepresented using tangent features. In addition to verifying our theoretical\nobservations in real neural networks on a simple regression problem, we\nempirically show that an adaptive feature implementation of tangent feature\nclassification has an order of magnitude lower sample complexity than the fixed\ntangent feature model on MNIST and CIFAR-10.\n","authors":["Daniel LeJeune","Sina Alemohammad"],"pdf_url":"https://arxiv.org/pdf/2308.15478v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.15474v1","updated":"2023-08-29T17:52:10Z","published":"2023-08-29T17:52:10Z","title":"A General-Purpose Self-Supervised Model for Computational Pathology","summary":" Tissue phenotyping is a fundamental computational pathology (CPath) task in\nlearning objective characterizations of histopathologic biomarkers in anatomic\npathology. However, whole-slide imaging (WSI) poses a complex computer vision\nproblem in which the large-scale image resolutions of WSIs and the enormous\ndiversity of morphological phenotypes preclude large-scale data annotation.\nCurrent efforts have proposed using pretrained image encoders with either\ntransfer learning from natural image datasets or self-supervised pretraining on\npublicly-available histopathology datasets, but have not been extensively\ndeveloped and evaluated across diverse tissue types at scale. We introduce UNI,\na general-purpose self-supervised model for pathology, pretrained using over\n100 million tissue patches from over 100,000 diagnostic haematoxylin and\neosin-stained WSIs across 20 major tissue types, and evaluated on 33\nrepresentative clinical tasks in CPath of varying diagnostic\ndifficulty. In addition to outperforming previous state-of-the-art models, we\ndemonstrate new modeling capabilities in CPath such as resolution-agnostic\ntissue classification, slide classification using few-shot class prototypes,\nand disease subtyping generalization in classifying up to 108 cancer types in\nthe OncoTree code classification system. 
UNI advances unsupervised\nrepresentation learning at scale in CPath in terms of both pretraining data and\ndownstream evaluation, enabling data-efficient AI models that can generalize\nand transfer to a gamut of diagnostically-challenging tasks and clinical\nworkflows in anatomic pathology.\n","authors":["Richard J. Chen","Tong Ding","Ming Y. Lu","Drew F. K. Williamson","Guillaume Jaume","Bowen Chen","Andrew Zhang","Daniel Shao","Andrew H. Song","Muhammad Shaban","Mane Williams","Anurag Vaidya","Sharifa Sahai","Lukas Oldenburg","Luca L. Weishaupt","Judy J. Wang","Walt Williams","Long Phi Le","Georg Gerber","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2308.15474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15472v1","updated":"2023-08-29T17:51:22Z","published":"2023-08-29T17:51:22Z","title":"Learning Modulated Transformation in GANs","summary":" The success of style-based generators largely benefits from style modulation,\nwhich helps take care of the cross-instance variation within data. However, the\ninstance-wise stochasticity is typically introduced via regular convolution,\nwhere kernels interact with features at some fixed locations, limiting its\ncapacity for modeling geometric variation. To alleviate this problem, we equip\nthe generator in generative adversarial networks (GANs) with a plug-and-play\nmodule, termed as modulated transformation module (MTM). This module predicts\nspatial offsets under the control of latent codes, based on which the\nconvolution operation can be applied at variable locations for different\ninstances, and hence offers the model an additional degree of freedom to handle\ngeometry deformation. Extensive experiments suggest that our approach can be\nfaithfully generalized to various generative tasks, including image generation,\n3D-aware image synthesis, and video generation, and is compatible with\nstate-of-the-art frameworks without any hyper-parameter tuning. It is\nnoteworthy that, towards human generation on the challenging TaiChi dataset, we\nimprove the FID of StyleGAN3 from 21.36 to 13.60, demonstrating the efficacy of\nlearning modulated geometry transformation.\n","authors":["Ceyuan Yang","Qihang Zhang","Yinghao Xu","Jiapeng Zhu","Yujun Shen","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2308.15472v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2308.15469v1","updated":"2023-08-29T17:48:33Z","published":"2023-08-29T17:48:33Z","title":"Multimodal Contrastive Learning and Tabular Attention for Automated\n Alzheimer's Disease Prediction","summary":" Alongside neuroimaging such as MRI scans and PET, Alzheimer's disease (AD)\ndatasets contain valuable tabular data including AD biomarkers and clinical\nassessments. Existing computer vision approaches struggle to utilize this\nadditional information. To address these needs, we propose a generalizable\nframework for multimodal contrastive learning of image data and tabular data, a\nnovel tabular attention module for amplifying and ranking salient features in\ntables, and the application of these techniques onto Alzheimer's disease\nprediction. Experimental evaluations demonstrate the strength of our framework\nby detecting Alzheimer's disease (AD) from over 882 MR image slices from the\nADNI database. We take advantage of the high interpretability of tabular data\nand our novel tabular attention approach, and through attribution of the\nattention scores for each row of the table, we note and rank the most\npredominant features. 
Results show that the model is capable of an accuracy of\nover 83.8%, almost a 10% increase from previous state of the art.\n","authors":["Weichen Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15466v1","updated":"2023-08-29T17:47:42Z","published":"2023-08-29T17:47:42Z","title":"Input margins can predict generalization too","summary":" Understanding generalization in deep neural networks is an active area of\nresearch. A promising avenue of exploration has been that of margin\nmeasurements: the shortest distance to the decision boundary for a given sample\nor its representation internal to the network. While margins have been shown to\nbe correlated with the generalization ability of a model when measured at its\nhidden representations (hidden margins), no such link between large margins and\ngeneralization has been established for input margins. We show that while input\nmargins are not generally predictive of generalization, they can be if the\nsearch space is appropriately constrained. We develop such a measure based on\ninput margins, which we refer to as `constrained margins'. The predictive power\nof this new measure is demonstrated on the 'Predicting Generalization in Deep\nLearning' (PGDL) dataset and contrasted with hidden representation margins. We\nfind that constrained margins achieve highly competitive scores and outperform\nother margin measurements in general. This provides a novel insight on the\nrelationship between generalization and classification margins, and highlights\nthe importance of considering the data manifold for investigations of\ngeneralization in DNNs.\n","authors":["Coenraad Mouton","Marthinus W. Theunissen","Marelie H. Davel"],"pdf_url":"https://arxiv.org/pdf/2308.15466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15462v1","updated":"2023-08-29T17:40:57Z","published":"2023-08-29T17:40:57Z","title":"Online Overexposed Pixels Hallucination in Videos with Adaptive\n Reference Frame Selection","summary":" Low dynamic range (LDR) cameras cannot deal with wide dynamic range inputs,\nfrequently leading to local overexposure issues. We present a learning-based\nsystem to reduce these artifacts without resorting to complex acquisition\nmechanisms like alternating exposures or costly processing that are typical of\nhigh dynamic range (HDR) imaging. We propose a transformer-based deep neural\nnetwork (DNN) to infer the missing HDR details. In an ablation study, we show\nthe importance of using a multiscale DNN and train it with the proper cost\nfunction to achieve state-of-the-art quality. To aid the reconstruction of the\noverexposed areas, our DNN takes a reference frame from the past as an\nadditional input. This leverages the commonly occurring temporal instabilities\nof autoexposure to our advantage: since well-exposed details in the current\nframe may be overexposed in the future, we use reinforcement learning to train\na reference frame selection DNN that decides whether to adopt the current frame\nas a future reference. Without resorting to alternating exposures, we obtain\ntherefore a causal, HDR hallucination algorithm with potential application in\ncommon video acquisition settings. 
Our demo video can be found at\nhttps://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view\n","authors":["Yazhou Xing","Amrita Mazumdar","Anjul Patney","Chao Liu","Hongxu Yin","Qifeng Chen","Jan Kautz","Iuri Frosio"],"pdf_url":"https://arxiv.org/pdf/2308.15462v1.pdf","comment":"The demo video can be found at\n https://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view"},{"id":"http://arxiv.org/abs/2301.13803v2","updated":"2023-08-29T17:38:45Z","published":"2023-01-31T17:44:59Z","title":"Fairness-aware Vision Transformer via Debiased Self-Attention","summary":" Vision Transformer (ViT) has recently gained significant interest in solving\ncomputer vision (CV) problems due to its capability of extracting informative\nfeatures and modeling long-range dependencies through the self-attention\nmechanism. To fully realize the advantages of ViT in real-world applications,\nrecent works have explored the trustworthiness of ViT, including its robustness\nand explainability. However, another desideratum, fairness, has not yet been\nadequately addressed in the literature. We establish that the existing\nfairness-aware algorithms (primarily designed for CNNs) do not perform well on\nViT. This necessitates developing our novel framework via Debiased\nSelf-Attention (DSA). DSA is a fairness-through-blindness approach that\nenforces ViT to eliminate spurious features correlated with the sensitive\nattributes for bias mitigation. Notably, adversarial examples are leveraged to\nlocate and mask the spurious features in the input image patches. In addition,\nDSA utilizes an attention weights alignment regularizer in the training\nobjective to encourage learning informative features for target prediction.\nImportantly, our DSA framework leads to improved fairness guarantees over prior\nworks on multiple prediction tasks without compromising target prediction\nperformance.\n","authors":["Yao Qiang","Chengyin Li","Prashant Khanduri","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2301.13803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15461v1","updated":"2023-08-29T17:38:33Z","published":"2023-08-29T17:38:33Z","title":"Canonical Factors for Hybrid Neural Fields","summary":" Factored feature volumes offer a simple way to build more compact, efficient,\nand interpretable neural fields, but also introduce biases that are not\nnecessarily beneficial for real-world data. In this work, we (1) characterize\nthe undesirable biases that these architectures have for axis-aligned signals\n-- they can lead to radiance field reconstruction differences of as high as 2\nPSNR -- and (2) explore how learning a set of canonicalizing transformations\ncan improve representations by removing these biases. We prove in a\ntwo-dimensional model problem that simultaneously learning these\ntransformations together with scene appearance succeeds with drastically\nimproved efficiency. We validate the resulting architectures, which we call\nTILTED, using image, signed distance, and radiance field reconstruction tasks,\nwhere we observe improvements across quality, robustness, compactness, and\nruntime. Results demonstrate that TILTED can enable capabilities comparable to\nbaselines that are 2x larger, while highlighting weaknesses of neural field\nevaluation procedures.\n","authors":["Brent Yi","Weijia Zeng","Sam Buchanan","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2308.15461v1.pdf","comment":"ICCV 2023. 
Project webpage: https://brentyi.github.io/tilted/"},{"id":"http://arxiv.org/abs/2308.15453v1","updated":"2023-08-29T17:23:33Z","published":"2023-08-29T17:23:33Z","title":"Pseudo-Boolean Polynomials Approach To Edge Detection And Image\n Segmentation","summary":" We introduce a deterministic approach to edge detection and image\nsegmentation by formulating pseudo-Boolean polynomials on image patches. The\napproach works by applying a binary classification of blob and edge regions in\nan image based on the degrees of pseudo-Boolean polynomials calculated on\npatches extracted from the provided image. We test our method on simple images\ncontaining primitive shapes of constant and contrasting colour and establish\nits feasibility before applying it to complex instances like aerial landscape\nimages. The proposed method is based on the exploitation of the reduction,\npolynomial degree, and equivalence properties of penalty-based pseudo-Boolean\npolynomials.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin","Alexey Samosyuk"],"pdf_url":"https://arxiv.org/pdf/2308.15453v1.pdf","comment":"14 pages, 8 figures, submitted to the International Conference Data\n Analysis, Optimization and Their Applications on the Occasion of Boris\n Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region,\n Moscow Institute of Physics and Technology\n https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php"},{"id":"http://arxiv.org/abs/2112.09153v2","updated":"2023-08-29T17:04:19Z","published":"2021-12-16T19:00:55Z","title":"An Empirical Investigation of the Role of Pre-training in Lifelong\n Learning","summary":" The lifelong learning paradigm in machine learning is an attractive\nalternative to the more prominent isolated learning scheme, not only due to its\nresemblance to biological learning but also due to its potential to reduce energy\nwaste by obviating excessive model re-training. A key challenge to this\nparadigm is the phenomenon of catastrophic forgetting. With the increasing\npopularity and success of pre-trained models in machine learning, we pose the\nquestion: What role does pre-training play in lifelong learning, specifically\nwith respect to catastrophic forgetting? We investigate existing methods in the\ncontext of large, pre-trained models and evaluate their performance on a\nvariety of text and image classification tasks, including a large-scale study\nusing a novel data set of 15 diverse NLP tasks. Across all settings, we observe\nthat generic pre-training implicitly alleviates the effects of catastrophic\nforgetting when learning multiple tasks sequentially compared to randomly\ninitialized models. We then further investigate why pre-training alleviates\nforgetting in this setting. We study this phenomenon by analyzing the loss\nlandscape, finding that pre-trained weights appear to ease forgetting by\nleading to wider minima. Based on this insight, we propose jointly optimizing\nfor current task loss and loss basin sharpness to explicitly encourage wider\nbasins during sequential fine-tuning. 
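One plausible instantiation of "jointly optimizing for current task loss and loss basin sharpness" is a sharpness-aware update in the spirit of SAM; the sketch below is an assumption about the general recipe, not the paper's exact objective.

```python
import torch

def sharpness_aware_step(model, loss_fn, x, y, optimizer, rho=0.05):
    # 1) gradient of the plain task loss at the current weights
    loss = loss_fn(model(x), y)
    loss.backward()
    grads = [p.grad.detach().clone() for p in model.parameters()]
    norm = torch.sqrt(sum(g.pow(2).sum() for g in grads)) + 1e-12
    # 2) probe the (approximately) worst nearby point in weight space
    with torch.no_grad():
        for p, g in zip(model.parameters(), grads):
            p.add_(rho * g / norm)
    optimizer.zero_grad()
    loss_fn(model(x), y).backward()   # gradient at the perturbed point
    # 3) restore the weights, then descend using the perturbed-point
    #    gradient, which implicitly favours wide, flat loss basins
    with torch.no_grad():
        for p, g in zip(model.parameters(), grads):
            p.sub_(rho * g / norm)
    optimizer.step()
    return loss.item()
```

Minimizing the loss at the worst point within a radius-rho ball penalizes sharp minima, which matches the abstract's goal of encouraging wider basins during sequential fine-tuning.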
We show that this optimization approach\noutperforms several state-of-the-art task-sequential continual learning\nalgorithms across multiple settings, occasionally even without retaining a\nmemory that scales in size with the number of tasks.\n","authors":["Sanket Vaibhav Mehta","Darshan Patil","Sarath Chandar","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2112.09153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15427v1","updated":"2023-08-29T16:33:16Z","published":"2023-08-29T16:33:16Z","title":"Complementing Onboard Sensors with Satellite Map: A New Perspective for\n HD Map Construction","summary":" High-Definition (HD) maps play a crucial role in autonomous driving systems.\nRecent methods have attempted to construct HD maps in real-time based on\ninformation obtained from vehicle onboard sensors. However, the performance of\nthese methods is significantly susceptible to the environment surrounding the\nvehicle due to the inherent limitations of onboard sensors, such as weak\ncapacity for long-range detection. In this study, we demonstrate that\nsupplementing onboard sensors with satellite maps can enhance the performance\nof HD map construction methods, leveraging the broad coverage capability of\nsatellite maps. For the purpose of further research, we release the satellite\nmap tiles as a complementary dataset to the nuScenes dataset. Meanwhile, we propose\na hierarchical fusion module that enables better fusion of satellite map\ninformation with existing methods. Specifically, we design an attention mask\nbased on segmentation and distance, applying the cross-attention mechanism to\nfuse onboard Bird's Eye View (BEV) features and satellite features in\nfeature-level fusion. An alignment module is introduced before concatenation in\nBEV-level fusion to mitigate the impact of misalignment between the two\nfeatures. The experimental results on the augmented nuScenes dataset showcase\nthe seamless integration of our module into three existing HD map construction\nmethods. It notably enhances their performance in both HD map semantic\nsegmentation and instance detection tasks.\n","authors":["Wenjie Gao","Jiawei Fu","Haodong Jing","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.15427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15413v1","updated":"2023-08-29T16:13:04Z","published":"2023-08-29T16:13:04Z","title":"WrappingNet: Mesh Autoencoder via Deep Sphere Deformation","summary":" There have been recent efforts to learn more meaningful representations via\nfixed length codewords from mesh data, since a mesh serves as a more complete\nmodel of the underlying 3D shape than a point cloud. However, the mesh\nconnectivity presents new difficulties when constructing a deep learning\npipeline for meshes. Previous mesh unsupervised learning approaches typically\nassume category-specific templates, e.g., human face/body templates. This\nrestricts the learned latent codes to only be meaningful for objects in a\nspecific category, so the learned latent spaces cannot be used across\ndifferent types of objects. In this work, we present WrappingNet, the first\nmesh autoencoder enabling general mesh unsupervised learning over heterogeneous\nobjects. It introduces a novel base graph in the bottleneck dedicated to\nrepresenting mesh connectivity, which is shown to facilitate learning a shared\nlatent space representing object shape. 
The superiority of WrappingNet mesh\nlearning is further demonstrated via improved reconstruction quality and\ncompetitive classification compared to point cloud learning, as well as latent\ninterpolation between meshes of different categories.\n","authors":["Eric Lei","Muhammad Asad Lodhi","Jiahao Pang","Junghyun Ahn","Dong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.15413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15405v1","updated":"2023-08-29T16:07:18Z","published":"2023-08-29T16:07:18Z","title":"Robust Long-Tailed Learning via Label-Aware Bounded CVaR","summary":" Data in real-world classification problems are always imbalanced or\nlong-tailed, wherein the majority classes have most of the samples and\ndominate the model training. In such a setting, a naive model tends to perform\npoorly on the minority classes. Previously, a variety of loss\nmodifications have been proposed to address the long-tailed learning problem,\nbut these methods either treat the samples within the same class\nindiscriminately or lack a theoretical guarantee. In this paper, we propose\ntwo novel approaches based on CVaR (Conditional Value at Risk) to improve the\nperformance of long-tailed learning on solid theoretical ground.\nSpecifically, we first introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss\nto overcome the pessimistic result of the original CVaR, and further design the\noptimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we\nadditionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to\nstabilize the optimization process, for which we also provide theoretical\nsupport. Extensive experiments on real-world datasets with long-tailed label\ndistributions verify the superiority of our proposed methods.\n","authors":["Hong Zhu","Runpeng Yu","Xing Tang","Yifei Wang","Yuan Fang","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12896v2","updated":"2023-08-29T15:57:02Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. 
Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v2.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.15397v1","updated":"2023-08-29T15:56:38Z","published":"2023-08-29T15:56:38Z","title":"Color Aesthetics: Fuzzy based User-driven Method for Harmony and\n Preference Prediction","summary":" Color is the most important intrinsic sensory feature that has a powerful\nimpact on product sales. Color is even responsible for raising the aesthetic\nsenses in our brains. Accounting for individual differences is crucial in color\naesthetics. It requires user-driven mechanisms for various e-commerce\napplications. We propose a method for quantitative evaluation of all types of\nperceptual responses to color(s): distinct color preference, color harmony, and\ncolor combination preference. Preference for color schemes can be predicted by\ncombining preferences for the basic colors and ratings of color harmony.\nHarmonious palettes are extracted from a big dataset using comparison algorithms\nbased on fuzzy similarity and grouping. The proposed model results in useful\npredictions of harmony and preference of multicolored images. For example, in\nthe context of apparel coordination, it allows predicting a preference for a\nlook based on clothing colors. Our approach differs from standard aesthetic\nmodels, since it accounts for personal variation. In addition, it can process\nnot only lower-order color pairs, but also groups of several colors.\n","authors":["Pakizar Shamoi","Atsushi Inoue","Hiroharu Kawanaka"],"pdf_url":"https://arxiv.org/pdf/2308.15397v1.pdf","comment":"It was accepted as a short paper. IFSA-SCIS 2017 Conference held in\n Otsu, Japan"},{"id":"http://arxiv.org/abs/2308.15386v1","updated":"2023-08-29T15:29:06Z","published":"2023-08-29T15:29:06Z","title":"Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation\n and Diagnosis","summary":" Thyroid nodule segmentation is a crucial step in the diagnostic procedure of\nphysicians and computer-aided diagnosis systems. Most current studies treat\nsegmentation and diagnosis as independent tasks without considering the\ncorrelation between these tasks. The sequential steps of these independent tasks\nin computer-aided diagnosis systems may lead to the accumulation of errors.\nTherefore, it is worth combining them as a whole by exploring the\nrelationship between thyroid nodule segmentation and diagnosis. According to\nthe thyroid imaging reporting and data system (TI-RADS), the assessment of\nshape and margin characteristics is the prerequisite for the discrimination of\nbenign and malignant thyroid nodules. These characteristics can be observed in\nthe thyroid nodule segmentation masks. Inspired by the diagnostic procedure of\nTI-RADS, this paper proposes a shape-margin knowledge augmented network\n(SkaNet) for simultaneous thyroid nodule segmentation and diagnosis. Due to\nthe similarity in visual features between segmentation and diagnosis, SkaNet\nshares visual features in the feature extraction stage and then utilizes a\ndual-branch architecture to perform thyroid nodule segmentation and diagnosis\ntasks simultaneously. To enhance effective discriminative features, an\nexponential mixture module is devised, which incorporates convolutional feature\nmaps and self-attention maps by exponential weighting. Then, SkaNet is jointly\noptimized by a knowledge augmented multi-task loss function with a constraint\npenalty term. 
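A hedged sketch of what such a knowledge-augmented multi-task objective could look like: segmentation and diagnosis losses plus a penalty tying diagnosis predictions to shape/margin statistics computed from the predicted mask. The penalty form, the `shape_margin_fn` descriptor (assumed to return one malignancy-consistent score per sample), and the weights are illustrative assumptions, not SkaNet's published formulation.

```python
import torch
import torch.nn.functional as F

def skanet_style_loss(seg_logits, seg_gt, cls_logits, cls_gt,
                      shape_margin_fn, lam=0.1):
    l_seg = F.binary_cross_entropy_with_logits(seg_logits, seg_gt)
    l_cls = F.cross_entropy(cls_logits, cls_gt)
    # numerical shape/margin descriptors of the predicted mask should be
    # consistent with the predicted malignancy probability (class 1)
    descriptors = shape_margin_fn(torch.sigmoid(seg_logits))  # (B, 1)
    penalty = F.mse_loss(descriptors, cls_logits.softmax(-1)[:, 1:2])
    return l_seg + l_cls + lam * penalty
```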
This loss embeds shape and margin characteristics through numerical\ncomputation and models the relationship between the thyroid nodule diagnosis\nresults and segmentation masks.\n","authors":["Weihua Liu","Chaochao Lin"],"pdf_url":"https://arxiv.org/pdf/2308.15386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00371v2","updated":"2023-08-29T15:25:30Z","published":"2023-07-01T15:48:33Z","title":"Learning Content-enhanced Mask Transformer for Domain Generalized\n Urban-Scene Segmentation","summary":" Domain-generalized urban-scene semantic segmentation (USSS) aims to learn\ngeneralized semantic predictions across diverse urban-scene styles. Unlike\ndomain gap challenges, USSS is unique in that the semantic categories are often\nsimilar in different urban scenes, while the styles can vary significantly due\nto changes in urban landscapes, weather conditions, lighting, and other\nfactors. Existing approaches typically rely on convolutional neural networks\n(CNNs) to learn the content of urban scenes.\n In this paper, we propose a Content-enhanced Mask TransFormer (CMFormer) for\ndomain-generalized USSS. The main idea is to enhance the focus of the mask\nattention mechanism, the fundamental component of Transformer segmentation\nmodels, on content information. To achieve this, we introduce a\nnovel content-enhanced mask attention mechanism. It learns mask queries from\nboth the image feature and its down-sampled counterpart, as lower-resolution\nimage features usually contain more robust content information and are less\nsensitive to style variations. These features are fused into a Transformer\ndecoder and integrated into a multi-resolution content-enhanced mask attention\nlearning scheme.\n Extensive experiments conducted on various domain-generalized urban-scene\nsegmentation datasets demonstrate that the proposed CMFormer significantly\noutperforms existing CNN-based methods for domain-generalized semantic\nsegmentation, achieving improvements of up to 14.00\\% in terms of mIoU (mean\nintersection over union). The source code for CMFormer will be made available\nat this\n\\href{https://github.com/BiQiWHU/domain-generalized-urban-scene-segmentation}{repository}.\n","authors":["Qi Bi","Shaodi You","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2307.00371v2.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.15378v1","updated":"2023-08-29T15:16:51Z","published":"2023-08-29T15:16:51Z","title":"On the Robustness of Object Detection Models in Aerial Images","summary":" The robustness of object detection models is a major concern when applied to\nreal-world scenarios. However, the performance of most object detection models\ndegrades when applied to images subjected to corruptions, since they are\nusually trained and evaluated on clean datasets. Enhancing the robustness of\nobject detection models is of utmost importance, especially for those designed\nfor aerial images, which feature complex backgrounds and substantial variations\nin the scales and orientations of objects. This paper addresses the challenge of\nassessing the robustness of object detection models in aerial images, with a\nspecific emphasis on scenarios where images are affected by clouds. In this\nstudy, we introduce two novel benchmarks based on DOTA-v1.0. The first\nbenchmark encompasses 19 prevalent corruptions, while the second focuses on\ncloud-corrupted images, a phenomenon uncommon in natural pictures yet frequent\nin aerial photography. 
We systematically evaluate the robustness of mainstream\nobject detection models and perform numerous ablation experiments. Through our\ninvestigations, we find that enhanced model architectures, larger networks,\nwell-crafted modules, and judicious data augmentation strategies collectively\nenhance the robustness of aerial object detection models. The benchmarks we\npropose and our comprehensive experimental analyses can facilitate research on\nrobust object detection in aerial images. Codes and datasets are available at:\n(https://github.com/hehaodong530/DOTA-C)\n","authors":["Haodong He","Jian Ding","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2308.15378v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.15367v1","updated":"2023-08-29T15:03:05Z","published":"2023-08-29T15:03:05Z","title":"Efficient Model Personalization in Federated Learning via\n Client-Specific Prompt Generation","summary":" Federated learning (FL) emerges as a decentralized learning framework which\ntrains models from multiple distributed clients without sharing their data,\npreserving privacy. Recently, large-scale pre-trained models (e.g., Vision\nTransformer) have shown a strong capability of deriving robust representations.\nHowever, the data heterogeneity among clients, the limited computation\nresources, and the communication bandwidth restrict the deployment of\nlarge-scale models in FL frameworks. To leverage robust representations from\nlarge-scale models while enabling efficient model personalization for\nheterogeneous clients, we propose a novel personalized FL framework of\nclient-specific Prompt Generation (pFedPG), which learns to deploy a\npersonalized prompt generator at the server for producing client-specific\nvisual prompts that efficiently adapt frozen backbones to local data\ndistributions. Our proposed framework jointly optimizes the stages of\npersonalized prompt adaptation locally and personalized prompt generation\nglobally. The former aims to train visual prompts that adapt foundation models\nto each client, while the latter observes local optimization directions to\ngenerate personalized prompts for all clients. Through extensive experiments on\nbenchmark datasets, we show that our pFedPG is favorable against\nstate-of-the-art personalized FL methods under various types of data\nheterogeneity, allowing computation- and communication-efficient model\npersonalization.\n","authors":["Fu-En Yang","Chien-Yi Wang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15367v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15366v1","updated":"2023-08-29T15:02:53Z","published":"2023-08-29T15:02:53Z","title":"AnomalyGPT: Detecting Industrial Anomalies using Large Vision-Language\n Models","summary":" Large Vision-Language Models (LVLMs) such as MiniGPT-4 and LLaVA have\ndemonstrated the capability of understanding images and achieved remarkable\nperformance in various visual tasks. Despite their strong abilities in\nrecognizing common objects due to extensive training datasets, they lack\nspecific domain knowledge and have a weaker understanding of localized details\nwithin objects, which hinders their effectiveness in the Industrial Anomaly\nDetection (IAD) task. On the other hand, most existing IAD methods only provide\nanomaly scores and necessitate the manual setting of thresholds to distinguish\nbetween normal and abnormal samples, which restricts their practical\nimplementation. 
In this paper, we explore the utilization of LVLMs to address\nthe IAD problem and propose AnomalyGPT, a novel IAD approach based on LVLM. We\ngenerate training data by simulating anomalous images and producing\ncorresponding textual descriptions for each image. We also employ an image\ndecoder to provide fine-grained semantics and design a prompt learner to\nfine-tune the LVLM using prompt embeddings. Our AnomalyGPT eliminates the need\nfor manual threshold adjustments and thus directly assesses the presence and\nlocations of anomalies. Additionally, AnomalyGPT supports multi-turn dialogues\nand exhibits impressive few-shot in-context learning capabilities. With only\none normal shot, AnomalyGPT achieves the state-of-the-art performance with an\naccuracy of 86.1%, an image-level AUC of 94.1%, and a pixel-level AUC of 95.3%\non the MVTec-AD dataset. Code is available at\nhttps://github.com/CASIA-IVA-Lab/AnomalyGPT.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10720v2","updated":"2023-08-29T14:59:28Z","published":"2023-06-19T06:41:19Z","title":"Exploring the Relationship between Samples and Masks for Robust Defect\n Localization","summary":" Defect detection aims to detect and localize regions outside the normal\ndistribution. Previous approaches model normality and compare it with the input\nto identify defective regions, potentially limiting their generalizability. This\npaper proposes a one-stage framework that detects defective patterns directly\nwithout the modeling process. This ability is achieved through the joint efforts\nof three components: a generative adversarial network (GAN), a newly proposed\nscaled pattern loss, and a dynamic masked cycle-consistent auxiliary network.\nExplicit information that could indicate the position of defects is\nintentionally excluded to avoid learning any direct mapping. Experimental\nresults on the texture class of the challenging MVTec AD dataset show that the\nproposed method is 2.9\\% higher than the SOTA methods in F1-Score, while\nsubstantially outperforming SOTA methods in generalizability.\n","authors":["Jiang Lin","Yaping yan"],"pdf_url":"https://arxiv.org/pdf/2306.10720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15357v1","updated":"2023-08-29T14:53:16Z","published":"2023-08-29T14:53:16Z","title":"Ego-Motion Estimation and Dynamic Motion Separation from 3D Point Clouds\n for Accumulating Data and Improving 3D Object Detection","summary":" New 3+1D high-resolution radar sensors are gaining importance for 3D object\ndetection in the automotive domain due to their relative affordability and\nimproved detection compared to classic low-resolution radar sensors. One\nlimitation of high-resolution radar sensors, compared to lidar sensors, is the\nsparsity of the generated point cloud. This sparsity could be partially\novercome by accumulating radar point clouds of subsequent time steps. This\ncontribution analyzes limitations of accumulating radar point clouds on the\nView-of-Delft dataset. By employing different ego-motion estimation approaches,\nwe analyze the dataset's inherent constraints and possible solutions.\nAdditionally, a learning-based instance motion estimation approach is deployed\nto investigate the influence of dynamic motion on the accumulated point cloud\nfor object detection. 
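A minimal sketch of the ego-motion-compensated accumulation step this abstract builds on, assuming per-timestep 4x4 ego-to-world SE(3) pose matrices; names are illustrative and not taken from the paper's code.

```python
import numpy as np

def accumulate_point_clouds(clouds, poses, t_now):
    """Merge past radar sweeps into the ego frame at time t_now."""
    world_to_now = np.linalg.inv(poses[t_now])
    merged = []
    for t, cloud in enumerate(clouds):          # cloud: (N, 3) xyz points
        homo = np.hstack([cloud, np.ones((len(cloud), 1))])
        # past ego frame -> world -> current ego frame
        in_now = (world_to_now @ poses[t] @ homo.T).T[:, :3]
        merged.append(in_now)
    return np.vstack(merged)
```

Note that this static-world transform smears points belonging to moving objects across their trajectories, which is precisely the error the dynamic motion separation investigated here must correct.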
Experiments demonstrate improved object detection\nperformance when applying an ego-motion estimation and dynamic motion correction\napproach.\n","authors":["Patrick Palmer","Martin Krueger","Richard Altendorfer","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2308.15357v1.pdf","comment":"Published at: AmE 2023 - Automotive meets Electronics; 14. GMM\n Symposium (https://ieeexplore.ieee.org/document/10227711)"},{"id":"http://arxiv.org/abs/2307.12676v5","updated":"2023-08-29T14:48:37Z","published":"2023-07-24T10:30:54Z","title":"Few-shot $\\mathbf{1/a}$ Anomalies Feedback : Damage Vision Mining\n Opportunity and Embedding Feature Imbalance","summary":" Over the past decade, previous balanced datasets have been used to advance\ndeep learning algorithms for industrial applications. In urban infrastructures\nand living environments, damage data mining cannot avoid imbalanced data issues\nbecause of rare unseen events and the high-quality status of improved\noperations. For visual inspection, the deteriorated classes acquired from the\nsurfaces of concrete and steel components are occasionally imbalanced. From\nnumerous related surveys, we conclude that imbalanced data problems can be\ncategorised into four types: 1) missing range of target and label variables, 2)\nmajority-minority class imbalance, 3) foreground-background spatial\nimbalance, and 4) long-tailed class of pixel-wise imbalance. Since 2015, many\nimbalanced studies have been conducted using deep-learning approaches,\nincluding regression, image classification, object detection, and semantic\nsegmentation. However, anomaly detection for imbalanced data is not well known.\nIn this study, we highlight a one-class anomaly detection application, whether\nan anomalous class is present or not, and demonstrate clear examples of imbalanced vision\ndatasets: medical disease, hazardous behaviour, material deterioration, plant\ndisease, river sludge, and disaster damage. We provide key results on the\nadvantage of damage-vision mining, hypothesising that the more effective the\nrange of the positive ratio, the higher the accuracy gain of the anomalies\nfeedback. In our imbalanced studies, compared with the balanced case with a\npositive ratio of $1/1$, we find that there is an applicable positive ratio\n$1/a$ where the accuracy is consistently high. However, the extremely\nimbalanced range spans from one shot to $1/2a$, the accuracy of which is inferior\nto that of the applicable ratio. In contrast, with a positive ratio ranging\nover $2/a$, it shifts into the over-mining phase without an effective gain in\naccuracy.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v5.pdf","comment":"34 pages, 53 figures, 28 tables"},{"id":"http://arxiv.org/abs/2308.15353v1","updated":"2023-08-29T14:48:29Z","published":"2023-08-29T14:48:29Z","title":"Detect, Augment, Compose, and Adapt: Four Steps for Unsupervised Domain\n Adaptation in Object Detection","summary":" Unsupervised domain adaptation (UDA) plays a crucial role in object detection\nwhen adapting a source-trained detector to a target domain without annotated\ndata. In this paper, we propose a novel and effective four-step UDA approach\nthat leverages self-supervision and trains on source and target data concurrently.\nWe harness self-supervised learning to mitigate the lack of ground truth in the\ntarget domain. 
Our method consists of the following steps: (1) identify the\nregion with the highest-confidence set of detections in each target image,\nwhich serve as our pseudo-labels; (2) crop the identified region and generate a\ncollection of its augmented versions; (3) combine the latter into a composite\nimage; (4) adapt the network to the target domain using the composed image.\nThrough extensive experiments under cross-camera, cross-weather, and\nsynthetic-to-real scenarios, our approach achieves state-of-the-art\nperformance, improving upon the nearest competitor by more than 2% in terms of\nmean Average Precision (mAP). The code is available at\nhttps://github.com/MohamedTEV/DACA.\n","authors":["Mohamed L. Mekhalfi","Davide Boscaini","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2308.15353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15346v1","updated":"2023-08-29T14:41:40Z","published":"2023-08-29T14:41:40Z","title":"Enhancing Mobile Face Anti-Spoofing: A Robust Framework for Diverse\n Attack Types under Screen Flash","summary":" Face anti-spoofing (FAS) is crucial for securing face recognition systems.\nHowever, existing FAS methods with handcrafted binary or pixel-wise labels have\nlimitations due to diverse presentation attacks (PAs). In this paper, we\npropose an attack type robust face anti-spoofing framework under light flash,\ncalled ATR-FAS. Due to imaging differences caused by various attack types,\ntraditional FAS methods based on a single binary classification network may\nresult in excessive intra-class distance of spoof faces, leading to challenges\nin decision boundary learning. Therefore, we employed multiple networks to\nreconstruct multi-frame depth maps as auxiliary supervision, and each network\nspecializes in one type of attack. A dual gate module (DGM) consisting of a type\ngate and a frame-attention gate is introduced, which perform attack type\nrecognition and multi-frame attention generation, respectively. The outputs of\nthe DGM are utilized as weights to mix the results of the multiple expert networks. The\nmulti-expert mixture enables ATR-FAS to generate spoof-differentiated depth\nmaps, and stably detects spoof faces without being affected by different types\nof PAs. Moreover, we design a differential normalization procedure to convert\noriginal flash frames into differential frames. This simple but effective\nprocessing enhances the details in flash frames, aiding in the generation of\ndepth maps. To verify the effectiveness of our framework, we collected a\nlarge-scale dataset containing 12,660 live and spoof videos with diverse PAs\nunder dynamic flash from the smartphone screen. Extensive experiments\nillustrate that the proposed ATR-FAS significantly outperforms existing\nstate-of-the-art methods. The code and dataset will be available at\nhttps://github.com/Chaochao-Lin/ATR-FAS.\n","authors":["Weihua Liu","Chaochao Lin","Yu Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15345v1","updated":"2023-08-29T14:41:10Z","published":"2023-08-29T14:41:10Z","title":"IndGIC: Supervised Action Recognition under Low Illumination","summary":" Technologies for human action recognition in the dark are gaining more and\nmore attention due to huge demand in surveillance, motion control and\nhuman-computer interaction. However, because of limitations in image enhancement\nmethods and low-lighting video datasets, e.g., labeling cost, existing methods\nface several problems. 
Some video-based approaches are effective and efficient on\nspecific datasets but cannot generalize to most cases, while other methods\nusing multiple sensors rely heavily on prior knowledge to deal with the noisy\nnature of video streams. In this paper, we propose an action recognition method\nusing a deep multi-input network. Furthermore, we propose an Independent Gamma\nIntensity Correction (Ind-GIC) to enhance poorly illuminated video, generating\none gamma per frame to increase enhancement performance. To prove our\nmethod is effective, we evaluate and compare it against existing\nmethods. Experimental results show that our model achieves high\naccuracy on the ARID dataset.\n","authors":["Jingbo Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.15345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15344v1","updated":"2023-08-29T14:41:05Z","published":"2023-08-29T14:41:05Z","title":"Imperceptible Adversarial Attack on Deep Neural Networks from Image\n Boundary","summary":" Although Deep Neural Networks (DNNs), such as the convolutional neural\nnetworks (CNN) and Vision Transformers (ViTs), have been successfully applied\nin the field of computer vision, they are demonstrated to be vulnerable to\nwell-sought Adversarial Examples (AEs) that can easily fool the DNNs. The\nresearch in AEs has been active, and many adversarial attacks and explanations\nhave been proposed since they were discovered in 2014. The mystery of the AE's\nexistence is still an open question, and many studies suggest that DNN training\nalgorithms have blind spots. The salient objects usually do not overlap with\nboundaries; hence, the boundaries draw little of the DNN model's attention.\nNevertheless, recent studies show that the boundaries can dominate the behavior\nof the DNN models. Hence, this study aims to look at the AEs from a different\nperspective and proposes an imperceptible adversarial attack that systematically\nattacks the input image boundary for finding the AEs. The experimental results\nhave shown that the proposed boundary attacking method effectively attacks six\nCNN models and the ViT using only 32% of the input image content (from the\nboundaries) with an average success rate (SR) of 95.2% and an average peak\nsignal-to-noise ratio of 41.37 dB. Correlation analyses are conducted,\nincluding the relation between the adversarial boundary's width and the SR and\nhow the adversarial boundary changes the DNN model's attention. This paper's\ndiscoveries can potentially advance the understanding of AEs and provide a\ndifferent perspective on how AEs can be constructed.\n","authors":["Fahad Alrasheedi","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.15344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15327v1","updated":"2023-08-29T14:23:44Z","published":"2023-08-29T14:23:44Z","title":"Enhancing Robot Learning through Learned Human-Attention Feature Maps","summary":" Robust and efficient learning remains a challenging problem in robotics, in\nparticular with complex visual inputs. Inspired by the human attention mechanism,\nwith which we quickly process complex visual scenes and react to changes in the\nenvironment, we believe that embedding auxiliary information about focus points\ninto robot learning would enhance the efficiency and robustness of the learning\nprocess. In this paper, we propose a novel approach to model and emulate the\nhuman attention with an approximate prediction model. 
We then leverage this\noutput and feed it as a structured auxiliary feature map into downstream\nlearning tasks. We validate this idea by learning a prediction model from\nhuman-gaze recordings of manual driving in the real world. We test our approach\non two learning tasks - object detection and imitation learning. Our\nexperiments demonstrate that the inclusion of predicted human attention leads\nto improved robustness of the trained models to out-of-distribution samples and\nfaster learning in low-data regime settings. Our work highlights the potential\nof incorporating structured auxiliary information in representation learning\nfor robotics and opens up new avenues for research in this direction. All code\nand data are available online.\n","authors":["Daniel Scheuchenstuhl","Stefan Ulmer","Felix Resch","Luigi Berducci","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2308.15327v1.pdf","comment":"This work has been accepted for the RAP4Robots workshop at ICRA 2023\n in London"},{"id":"http://arxiv.org/abs/2308.15323v1","updated":"2023-08-29T14:20:13Z","published":"2023-08-29T14:20:13Z","title":"Occlusion-Aware Deep Convolutional Neural Network via Homogeneous\n Tanh-transforms for Face Parsing","summary":" Face parsing infers a pixel-wise label map for each semantic facial\ncomponent. Previous methods generally work well for uncovered faces but\noverlook facial occlusion and ignore some contextual areas outside a single\nface, even though facial occlusion has become a common situation during the\nCOVID-19 epidemic. Inspired by the illumination theory of images, we propose\nnovel homogeneous tanh-transforms for image preprocessing, made up of\nfour tanh-transforms that fuse the central vision and the peripheral vision\ntogether. Our proposed method addresses the dilemma of face parsing under\nocclusion and compresses more information from the surrounding context. Based on\nhomogeneous tanh-transforms, we propose an occlusion-aware convolutional neural\nnetwork for occluded face parsing. It combines the information both in\nTanh-polar space and Tanh-Cartesian space, which enhances the receptive\nfields. Furthermore, we introduce an occlusion-aware loss to focus on the\nboundaries of occluded regions. The network is simple and flexible, and can be\ntrained end-to-end. To facilitate future research of occluded face parsing, we\nalso contribute a new cleaned face parsing dataset, which is manually purified\nfrom several academic or industrial datasets, including CelebAMask-HQ,\nShort-video Face Parsing, and the Helen dataset, and will make it public.\nExperiments demonstrate that our method surpasses state-of-the-art methods for face\nparsing under occlusion.\n","authors":["Weihua Liu","Chaochao Lin","Haoping Yu","Said Boumaraf","Zhaoqiong Pi"],"pdf_url":"https://arxiv.org/pdf/2308.15323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v1","updated":"2023-08-29T14:16:09Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then attribute the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. 
Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v1.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2303.09790v4","updated":"2023-08-29T14:16:04Z","published":"2023-03-17T06:18:16Z","title":"Reliable Multimodality Eye Disease Screening via Mixture of Student's t\n Distributions","summary":" Multimodality eye disease screening is crucial in ophthalmology as it\nintegrates information from diverse sources to complement their respective\nperformances. However, the existing methods are weak in assessing the\nreliability of each unimodality, and directly fusing an unreliable modality may\ncause screening errors. To address this issue, we introduce a novel\nmultimodality evidential fusion pipeline for eye disease screening, EyeMoSt,\nwhich provides a measure of confidence for each unimodality and elegantly integrates\nthe multimodality information from a multi-distribution fusion perspective.\nSpecifically, our model estimates both local uncertainty for unimodality and\nglobal uncertainty for the fusion modality to produce reliable classification\nresults. More importantly, the proposed mixture of Student's $t$ distributions\nadaptively integrates different modalities to endow the model with heavy-tailed\nproperties, increasing robustness and reliability. Our experimental findings on\nboth public and in-house datasets show that our model is more reliable than\ncurrent methods. Additionally, EyeMoSt has the potential to serve as a\ndata quality discriminator, enabling reliable decision-making for multimodality\neye disease screening.\n","authors":["Ke Zou","Tian Lin","Xuedong Yuan","Haoyu Chen","Xiaojing Shen","Meng Wang","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2303.09790v4.pdf","comment":"MICCAI 2023 (Early accept):11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.15316v1","updated":"2023-08-29T14:02:27Z","published":"2023-08-29T14:02:27Z","title":"3D-MuPPET: 3D Multi-Pigeon Pose Estimation and Tracking","summary":" Markerless methods for animal posture tracking have been developing recently,\nbut frameworks and benchmarks for tracking large animal groups in 3D are still\nlacking. To overcome this gap in the literature, we present 3D-MuPPET, a\nframework to estimate and track 3D poses of up to 10 pigeons at interactive\nspeed using multiple views. We train a pose estimator to infer 2D keypoints and\nbounding boxes of multiple pigeons, then triangulate the keypoints to 3D. For\ncorrespondence matching, we first dynamically match 2D detections to global\nidentities in the first frame, then use a 2D tracker to maintain\ncorrespondences across views in subsequent frames. 
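A hedged sketch of the triangulation step that lifts matched 2D keypoints to 3D (a standard direct linear transform); the 3x4 projection matrices are assumed known from multi-camera calibration, and this is generic practice rather than 3D-MuPPET's exact code.

```python
import numpy as np

def triangulate(points_2d, projections):
    """points_2d: list of (u, v) per view; projections: list of 3x4 matrices."""
    rows = []
    for (u, v), P in zip(points_2d, projections):
        rows.append(u * P[2] - P[0])   # each view contributes two
        rows.append(v * P[2] - P[1])   # linear constraints on X
    _, _, vt = np.linalg.svd(np.asarray(rows))
    X = vt[-1]                          # null-space solution
    return X[:3] / X[3]                 # dehomogenize to xyz
```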
We achieve comparable\naccuracy to a state-of-the-art 3D pose estimator in terms of Root Mean Square Error\n(RMSE) and Percentage of Correct Keypoints (PCK). We also showcase a novel use\ncase where our model trained with data of single pigeons provides comparable\nresults on data containing multiple pigeons. This can simplify the domain shift\nto new species because annotating single animal data is less labour-intensive\nthan multi-animal data. Additionally, we benchmark the inference speed of\n3D-MuPPET, with up to 10 fps in 2D and 1.5 fps in 3D, and perform quantitative\ntracking evaluation, which yields encouraging results. Finally, we show that\n3D-MuPPET also works in natural environments without model fine-tuning on\nadditional annotations. To the best of our knowledge, we are the first to\npresent a framework for 2D/3D posture and trajectory tracking that works in\nboth indoor and outdoor environments.\n","authors":["Urs Waldmann","Alex Hoi Hang Chan","Hemal Naik","Máté Nagy","Iain D. Couzin","Oliver Deussen","Bastian Goldluecke","Fumihiro Kano"],"pdf_url":"https://arxiv.org/pdf/2308.15316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15313v1","updated":"2023-08-29T14:00:55Z","published":"2023-08-29T14:00:55Z","title":"Spatio-temporal MLP-graph network for 3D human pose estimation","summary":" Graph convolutional networks and their variants have shown significant\npromise in 3D human pose estimation. Despite their success, most of these\nmethods only consider spatial correlations between body joints and do not take\ninto account temporal correlations, thereby limiting their ability to capture\nrelationships in the presence of occlusions and inherent ambiguity. To address\nthis potential weakness, we propose a spatio-temporal network architecture\ncomposed of a joint-mixing multi-layer perceptron block that facilitates\ncommunication among different joints and a graph weighted Jacobi network block\nthat enables communication among various feature channels. The major novelty of\nour approach lies in a new weighted Jacobi feature propagation rule obtained\nthrough graph filtering with implicit fairing. We leverage temporal information\nfrom the 2D pose sequences, and integrate weight modulation into the model to\nenable untangling of the feature transformations of distinct nodes. We also\nemploy adjacency modulation with the aim of learning meaningful correlations\nbeyond defined linkages between body joints by altering the graph topology\nthrough a learnable modulation matrix. Extensive experiments on two benchmark\ndatasets demonstrate the effectiveness of our model, outperforming recent\nstate-of-the-art methods for 3D human pose estimation.\n","authors":["Tanvir Hassan","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2308.15313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03981v3","updated":"2023-08-29T13:50:43Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from classes unseen during training is a\nmajor limitation of artificial intelligence in real-world implementations for the\nrecognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. 
Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with a thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check, in the datasets of non-target-category retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07494v2","updated":"2023-08-29T13:43:37Z","published":"2023-07-14T17:27:22Z","title":"TALL: Thumbnail Layout for Deepfake Video Detection","summary":" The growing threats of deepfakes to society and cybersecurity have raised\nenormous public concerns, and increasing efforts have been devoted to this\ncritical topic of deepfake video detection. Existing video methods achieve good\nperformance but are computationally intensive. This paper introduces a simple\nyet effective strategy named Thumbnail Layout (TALL), which transforms a video\nclip into a pre-defined layout to realize the preservation of spatial and\ntemporal dependencies. Specifically, consecutive frames are masked in a fixed\nposition in each frame to improve generalization, then resized to sub-images\nand rearranged into a pre-defined layout as the thumbnail. TALL is\nmodel-agnostic and extremely simple, modifying only a few lines of code.\nInspired by the success of vision transformers, we incorporate TALL into Swin\nTransformer, forming an efficient and effective method TALL-Swin. Extensive\nintra-dataset and cross-dataset experiments validate the effectiveness and\nsuperiority of TALL and the SOTA TALL-Swin. TALL-Swin achieves 90.79$\\%$ AUC on the\nchallenging cross-dataset task, FaceForensics++ $\\to$ Celeb-DF. The code is\navailable at https://github.com/rainy-xu/TALL4Deepfake.\n","authors":["Yuting Xu","Jian Liang","Gengyun Jia","Ziming Yang","Yanhao Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2307.07494v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2304.04902v2","updated":"2023-08-29T13:42:27Z","published":"2023-04-11T00:17:34Z","title":"Weakly Supervised Intracranial Hemorrhage Segmentation using Head-Wise\n Gradient-Infused Self-Attention Maps from a Swin Transformer in Categorical\n Learning","summary":" Intracranial hemorrhage (ICH) is a life-threatening medical emergency that\nrequires timely and accurate diagnosis for effective treatment and improved\npatient survival rates. While deep learning techniques have emerged as the\nleading approach for medical image analysis and processing, the most commonly\nemployed supervised learning often requires large, high-quality annotated\ndatasets that can be costly to obtain, particularly for pixel/voxel-wise image\nsegmentation. 
To address this challenge and facilitate ICH treatment decisions,\nwe introduce a novel weakly supervised method for ICH segmentation, utilizing a\nSwin transformer trained on an ICH classification task with categorical labels.\nOur approach leverages a hierarchical combination of head-wise gradient-infused\nself-attention maps to generate accurate image segmentation. Additionally, we\nconducted an exploratory study on different learning strategies and showed that\nbinary ICH classification has a more positive impact on self-attention maps\ncompared to full ICH subtyping. With a mean Dice score of 0.44, our technique\nachieved similar ICH segmentation performance to the popular U-Net and\nSwin-UNETR models with full supervision and outperformed a similar weakly\nsupervised approach using GradCAM, demonstrating the excellent potential of the\nproposed framework in challenging medical image segmentation tasks. Our code is\navailable at https://github.com/HealthX-Lab/HGI-SAM.\n","authors":["Amirhossein Rasoulian","Soorena Salari","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2304.04902v2.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2023:012"},{"id":"http://arxiv.org/abs/2308.15300v1","updated":"2023-08-29T13:38:35Z","published":"2023-08-29T13:38:35Z","title":"MSFlow: Multi-Scale Flow-based Framework for Unsupervised Anomaly\n Detection","summary":" Unsupervised anomaly detection (UAD) attracts a lot of research interest and\ndrives widespread applications, where only anomaly-free samples are available\nfor training. Some UAD applications intend to further locate the anomalous\nregions without any anomaly information.\n Although the absence of anomalous samples and annotations deteriorates the\nUAD performance, an inconspicuous yet powerful statistical model, the\nnormalizing flow, is appropriate for anomaly detection and localization in an\nunsupervised fashion. The flow-based probabilistic models, only trained on\nanomaly-free data, can efficiently distinguish unpredictable anomalies by\nassigning them much lower likelihoods than normal data.\n Nevertheless, the size variation of unpredictable anomalies introduces\nanother inconvenience to the flow-based methods for high-precision anomaly\ndetection and localization. To generalize across anomaly size variations, we\npropose a novel Multi-Scale Flow-based framework dubbed MSFlow composed of\nasymmetrical parallel flows followed by a fusion flow to exchange multi-scale\nperceptions. Moreover, different multi-scale aggregation strategies are adopted\nfor image-wise anomaly detection and pixel-wise anomaly localization according\nto the discrepancy between them. The proposed MSFlow is evaluated on three\nanomaly detection datasets, significantly outperforming existing methods.\nNotably, on the challenging MVTec AD benchmark, our MSFlow achieves a new\nstate-of-the-art with a detection AUROC score of up to 99.7%, localization\nAUROC score of 98.8%, and PRO score of 97.1%. 
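A minimal sketch of the likelihood-based scoring idea that flow methods like this share: a flow trained only on anomaly-free data assigns low log-likelihood to anomalies. The `flow.log_prob` interface is an assumption (common in flow libraries), and the threshold would be calibrated on held-out normal data.

```python
import torch

@torch.no_grad()
def anomaly_scores(flow, images, threshold):
    log_px = flow.log_prob(images)        # per-image log-likelihood
    scores = -log_px                      # higher score = more anomalous
    return scores, scores > threshold     # flag likely anomalies
```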
The reproducible code is\navailable at https://github.com/cool-xuan/msflow.\n","authors":["Yixuan Zhou","Xing Xu","Jingkuan Song","Fumin Shen","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.15300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15284v1","updated":"2023-08-29T13:15:13Z","published":"2023-08-29T13:15:13Z","title":"ARTxAI: Explainable Artificial Intelligence Curates Deep Representation\n Learning for Artistic Images using Fuzzy Techniques","summary":" Automatic art analysis employs different image processing techniques to\nclassify and categorize works of art. When working with artistic images, we\nneed to take into account further considerations compared to classical image\nprocessing. This is because such artistic paintings change drastically\ndepending on the author, the scene depicted, and their artistic style. This can\nresult in features that perform very well in a given task but do not grasp the\nwhole of the visual and symbolic information contained in a painting. In this\npaper, we show how the features obtained from different tasks in artistic image\nclassification are suitable for solving others of a similar nature. We present\ndifferent methods to improve the generalization capabilities and performance of\nartistic classification systems. Furthermore, we propose an explainable\nartificial intelligence method to map the known visual traits of an image to the\nfeatures used by the deep learning model by means of fuzzy rules. These rules\nshow the patterns and variables that are relevant to solving each task and how\neffective each of the discovered patterns is. Our results show that our proposed\ncontext-aware features can achieve up to $6\\%$ and $26\\%$ more accurate results\nthan other context- and non-context-aware solutions, respectively, depending on\nthe specific task. We also show that some of the features used by these models\ncan be more clearly correlated to visual traits in the original image than\nothers.\n","authors":["Javier Fumanal-Idocin","Javier Andreu-Perez","Oscar Cordón","Hani Hagras","Humberto Bustince"],"pdf_url":"https://arxiv.org/pdf/2308.15284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00135v4","updated":"2023-08-29T13:10:56Z","published":"2022-12-31T06:32:36Z","title":"TeViS:Translating Text Synopses to Video Storyboards","summary":" A video storyboard is a roadmap for video creation which consists of\nshot-by-shot images to visualize key plots in a text synopsis. Creating video\nstoryboards, however, remains challenging, as it not only requires cross-modal\nassociation between high-level texts and images but also demands long-term\nreasoning to make transitions smooth across shots. In this paper, we propose a\nnew task called Text synopsis to Video Storyboard (TeViS) which aims to\nretrieve an ordered sequence of images as the video storyboard to visualize the\ntext synopsis. We construct a MovieNet-TeViS dataset based on the public\nMovieNet dataset. It contains 10K text synopses, each paired with keyframes\nmanually selected from corresponding movies by considering both relevance and\ncinematic coherence. To benchmark the task, we present strong CLIP-based\nbaselines and a novel VQ-Trans. VQ-Trans first encodes the text synopsis and images\ninto a joint embedding space and uses vector quantization (VQ) to improve the\nvisual representation. Then, it auto-regressively generates a sequence of\nvisual features for retrieval and ordering. 
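A hedged sketch of the vector-quantization step mentioned above: each continuous visual feature is snapped to its nearest codebook entry, with a straight-through estimator so gradients still reach the encoder. This is generic VQ-VAE practice under stated assumptions, not code from TeViS.

```python
import torch

def vector_quantize(features, codebook):
    """features: (N, D); codebook: (K, D) learned embedding table."""
    dists = torch.cdist(features, codebook)   # (N, K) pairwise distances
    idx = dists.argmin(dim=1)                  # nearest code per feature
    quantized = codebook[idx]
    # straight-through: forward uses codes, backward flows to `features`
    return features + (quantized - features).detach(), idx
```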
Experimental results demonstrate\nthat VQ-Trans significantly outperforms prior methods and the CLIP-based\nbaselines. Nevertheless, there is still a large gap compared to human\nperformance, suggesting room for promising future work. The code and data are\navailable at: \url{https://ruc-aimind.github.io/projects/TeViS/}\n","authors":["Xu Gu","Yuchong Sun","Feiyue Ni","Shizhe Chen","Xihua Wang","Ruihua Song","Boyuan Li","Xiang Cao"],"pdf_url":"https://arxiv.org/pdf/2301.00135v4.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.15280v1","updated":"2023-08-29T13:10:53Z","published":"2023-08-29T13:10:53Z","title":"ADFA: Attention-augmented Differentiable top-k Feature Adaptation for\n Unsupervised Medical Anomaly Detection","summary":" The scarcity of annotated data, particularly for rare diseases, limits the\nvariability of training data and the range of detectable lesions, presenting a\nsignificant challenge for supervised anomaly detection in medical imaging. To\nsolve this problem, we propose a novel unsupervised method for medical image\nanomaly detection: Attention-Augmented Differentiable top-k Feature Adaptation\n(ADFA). The method utilizes a Wide-ResNet50-2 (WR50) network pre-trained on\nImageNet to extract initial feature representations. To reduce the channel\ndimensionality while preserving relevant channel information, we employ an\nattention-augmented patch descriptor on the extracted features. We then apply\ndifferentiable top-k feature adaptation to train the patch descriptor, mapping\nthe extracted feature representations to a new vector space, enabling effective\ndetection of anomalies. Experiments show that ADFA outperforms state-of-the-art\n(SOTA) methods on multiple challenging medical image datasets, confirming its\neffectiveness in medical anomaly detection.\n","authors":["Yiming Huang","Guole Liu","Yaoru Luo","Ge Yang"],"pdf_url":"https://arxiv.org/pdf/2308.15280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15273v1","updated":"2023-08-29T13:02:35Z","published":"2023-08-29T13:02:35Z","title":"Cross-Modal Retrieval Meets Inference:Improving Zero-Shot Classification\n with Cross-Modal Retrieval","summary":" Contrastive language-image pre-training (CLIP) has demonstrated remarkable\nzero-shot classification ability, namely image classification using novel text\nlabels. Existing works have attempted to enhance CLIP by fine-tuning on\ndownstream tasks, but these have inadvertently led to performance degradation\non unseen classes, thus harming zero-shot generalization. This paper aims to\naddress this challenge by leveraging readily available image-text pairs from an\nexternal dataset for cross-modal guidance during inference. To this end, we\npropose X-MoRe, a novel inference method comprising two key steps: (1)\ncross-modal retrieval and (2) modal-confidence-based ensemble. Given a query\nimage, we harness the power of CLIP's cross-modal representations to retrieve\nrelevant textual information from an external image-text pair dataset. Then, we\nassign higher weights to the more reliable modality between the original query\nimage and retrieved text, contributing to the final prediction. 
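A hedged sketch of the two-step inference just described, assuming L2-normalized CLIP embeddings, an external caption bank, and max-softmax probability as the confidence measure (the latter is an illustrative choice, not necessarily X-MoRe's).

```python
import torch

@torch.no_grad()
def x_more_style_predict(img_emb, caption_embs, class_text_embs, k=8):
    # (1) cross-modal retrieval: k most similar captions from the bank
    sims = img_emb @ caption_embs.T
    retrieved = caption_embs[sims.topk(k).indices].mean(dim=0)
    # zero-shot class probabilities from each modality
    p_img = (img_emb @ class_text_embs.T).softmax(-1)
    p_txt = (retrieved @ class_text_embs.T).softmax(-1)
    # (2) confidence-based ensemble: weight the surer modality higher
    w_img, w_txt = p_img.max(), p_txt.max()
    return (w_img * p_img + w_txt * p_txt) / (w_img + w_txt)
```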
X-MoRe\ndemonstrates robust performance across a diverse set of tasks without the need\nfor additional training, showcasing the effectiveness of utilizing cross-modal\nfeatures to maximize CLIP's zero-shot ability.\n","authors":["Seongha Eom","Namgyu Ho","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.15273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07439v2","updated":"2023-08-29T12:57:17Z","published":"2023-07-14T16:04:03Z","title":"Atlas-Based Interpretable Age Prediction In Whole-Body MR Images","summary":" Age prediction is an important part of medical assessments and research. It\ncan aid in detecting diseases as well as abnormal ageing by highlighting the\ndiscrepancy between chronological and biological age. To gain a comprehensive\nunderstanding of age-related changes observed in various body parts, we\ninvestigate them on a larger scale by using whole-body images. We utilise the\nGrad-CAM interpretability method to determine the body areas most predictive of\na person's age. We expand our analysis beyond individual subjects by employing\nregistration techniques to generate population-wide interpretability maps.\nFurthermore, we set a new state of the art in whole-body age prediction with a model\nthat achieves a mean absolute error of 2.76 years. Our findings reveal three\nprimary areas of interest: the spine, the autochthonous back muscles, and the\ncardiac region, which exhibits the highest importance.\n","authors":["Sophie Starck","Yadunandan Vivekanand Kini","Jessica Johanna Maria Ritter","Rickmer Braren","Daniel Rueckert","Tamara Mueller"],"pdf_url":"https://arxiv.org/pdf/2307.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15266v1","updated":"2023-08-29T12:51:04Z","published":"2023-08-29T12:51:04Z","title":"NOVIS: A Case for End-to-End Near-Online Video Instance Segmentation","summary":" Until recently, the Video Instance Segmentation (VIS) community operated\nunder the common belief that offline methods are generally superior to\nframe-by-frame online processing. However, the recent success of online methods\nquestions this belief, in particular for challenging and long video sequences.\nWe understand this work as a rebuttal of those recent observations and an\nappeal to the community to focus on dedicated near-online VIS approaches. To\nsupport our argument, we present a detailed analysis of different processing\nparadigms and the new end-to-end trainable NOVIS (Near-Online Video Instance\nSegmentation) method. Our transformer-based model directly predicts\nspatio-temporal mask volumes for clips of frames and performs instance tracking\nbetween clips via overlap embeddings. NOVIS represents the first near-online\nVIS approach which avoids any handcrafted tracking heuristics. We outperform\nall existing VIS methods by large margins and provide new state-of-the-art\nresults on both YouTube-VIS (2019/2021) and the OVIS benchmarks.\n","authors":["Tim Meinhardt","Matt Feiszli","Yuchen Fan","Laura Leal-Taixe","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2308.15266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14105v2","updated":"2023-08-29T12:43:53Z","published":"2023-08-27T13:22:55Z","title":"Unified and Dynamic Graph for Temporal Character Grouping in Long Videos","summary":" Video temporal character grouping locates appearing moments of major\ncharacters within a video according to their identities. To this end, recent\nworks have evolved from unsupervised clustering to graph-based supervised\nclustering.
However, graph methods are built upon the premise of fixed affinity\ngraphs, which introduces many inexact connections. Besides, they extract multi-modal\nfeatures with several kinds of models, which is unfriendly to deployment. In this\npaper, we present a unified and dynamic graph (UniDG) framework for temporal\ncharacter grouping. This is accomplished firstly by a unified representation\nnetwork that learns representations of multiple modalities within the same\nspace while still preserving each modality's uniqueness. Secondly,\nwe present a dynamic graph clustering method in which different numbers of\nneighbors are dynamically constructed for each node via a cyclic matching\nstrategy, leading to a more reliable affinity graph. Thirdly, a progressive\nassociation method is introduced to exploit spatial and temporal contexts among\ndifferent modalities, allowing multi-modal clustering results to be well fused.\nAs current datasets only provide pre-extracted features, we evaluate our UniDG\nmethod on a collected dataset named MTCG, which contains each character's\nappearing clips of face and body and speaking voice tracks. We also evaluate\nour key components on existing clustering and retrieval datasets to verify the\ngeneralization ability. Experimental results demonstrate that our method can\nachieve promising results and outperform several state-of-the-art approaches.\n","authors":["Xiujun Shu","Wei Wen","Liangsheng Xu","Mingbao Lin","Ruizhi Qiao","Taian Guo","Hanjun Li","Bei Gan","Xiao Wang","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15262v1","updated":"2023-08-29T12:41:50Z","published":"2023-08-29T12:41:50Z","title":"Enhancing OCR Performance through Post-OCR Models: Adopting Glyph\n Embedding for Improved Correction","summary":" The study investigates the potential of post-OCR models to overcome\nlimitations in OCR models and explores the impact of incorporating glyph\nembedding on post-OCR correction performance. In this study, we have developed\nour own post-OCR correction model. The novelty of our approach lies in\nembedding the OCR output using CharBERT and our unique embedding technique,\ncapturing the visual characteristics of characters. Our findings show that\npost-OCR correction effectively addresses deficiencies in inferior OCR models,\nand glyph embedding enables the model to achieve superior results, including\nthe ability to correct individual words.\n","authors":["Yung-Hsin Chen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07392v2","updated":"2023-08-29T12:37:04Z","published":"2023-08-14T18:23:18Z","title":"A Unified Query-based Paradigm for Camouflaged Instance Segmentation","summary":" Due to the high similarity between camouflaged instances and the background,\nthe recently proposed camouflaged instance segmentation (CIS) faces challenges\nin accurate localization and instance segmentation. To this end, inspired by\nquery-based transformers, we propose a unified query-based multi-task learning\nframework for camouflaged instance segmentation, termed UQFormer, which builds\na set of mask queries and a set of boundary queries to learn a shared composed\nquery representation and efficiently integrates global camouflaged object\nregion and boundary cues, for simultaneous instance segmentation and instance\nboundary detection in camouflaged scenarios.
Specifically, we design a composed\nquery learning paradigm that learns a shared representation to capture object\nregion and boundary features by the cross-attention interaction of mask queries\nand boundary queries in the designed multi-scale unified learning transformer\ndecoder. Then, we present a transformer-based multi-task learning framework for\nsimultaneous camouflaged instance segmentation and camouflaged instance\nboundary detection based on the learned composed query representation, which\nalso forces the model to learn a strong instance-level query representation.\nNotably, our model views instance segmentation as a query-based direct set\nprediction problem, without other post-processing such as non-maximum\nsuppression. Compared with 14 state-of-the-art approaches, our UQFormer\nsignificantly improves the performance of camouflaged instance segmentation.\nOur code will be available at https://github.com/dongbo811/UQFormer.\n","authors":["Bo Dong","Jialun Pei","Rongrong Gao","Tian-Zhu Xiang","Shuo Wang","Huan Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.07392v2.pdf","comment":"This paper has been accepted by ACM MM2023"},{"id":"http://arxiv.org/abs/2304.09121v3","updated":"2023-08-29T12:32:01Z","published":"2023-04-18T16:37:18Z","title":"Fast Neural Scene Flow","summary":" Neural Scene Flow Prior (NSFP) is of significant interest to the vision\ncommunity due to its inherent robustness to out-of-distribution (OOD) effects\nand its ability to deal with dense lidar points. The approach utilizes a\ncoordinate neural network to estimate scene flow at runtime, without any\ntraining. However, it is up to 100 times slower than current state-of-the-art\nlearning methods. In other applications, such as image, video, and radiance\nfunction reconstruction, innovations in speeding up the runtime performance of\ncoordinate networks have centered upon architectural changes. In this paper, we\ndemonstrate that scene flow is different -- with the dominant computational\nbottleneck stemming from the loss function itself (i.e., Chamfer distance).\nFurther, we rediscover the distance transform (DT) as an efficient,\ncorrespondence-free loss function that dramatically speeds up the runtime\noptimization. Our fast neural scene flow (FNSF) approach reports for the first\ntime real-time performance comparable to learning methods, without any training\nor OOD bias on two of the largest open autonomous driving (AV) lidar datasets,\nWaymo Open and Argoverse.\n","authors":["Xueqian Li","Jianqiao Zheng","Francesco Ferroni","Jhony Kaesemodel Pontes","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2304.09121v3.pdf","comment":"17 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.15236v1","updated":"2023-08-29T11:51:27Z","published":"2023-08-29T11:51:27Z","title":"Rotation Augmented Distillation for Exemplar-Free Class Incremental\n Learning with Detailed Analysis","summary":" Class incremental learning (CIL) aims to recognize both the old and new\nclasses along the incremental tasks. Deep neural networks in CIL suffer from\ncatastrophic forgetting and some approaches rely on saving exemplars from\nprevious tasks, known as the exemplar-based setting, to alleviate this problem.\nOn the contrary, this paper focuses on the Exemplar-Free setting with no old\nclass sample preserved. Balancing the plasticity and stability in deep feature\nlearning with only supervision from new classes is more challenging.
Most\nexisting Exemplar-Free CIL methods report the overall performance only and lack\nfurther analysis. In this work, different methods are examined with\ncomplementary metrics in greater detail. Moreover, we propose a simple CIL\nmethod, Rotation Augmented Distillation (RAD), which achieves one of the\ntop-tier performances under the Exemplar-Free setting. Detailed analysis shows\nour RAD benefits from the superior balance between plasticity and stability.\nFinally, more challenging exemplar-free settings with fewer initial classes are\nundertaken for further demonstrations and comparisons among the\nstate-of-the-art methods.\n","authors":["Xiuwei Chen","Xiaobin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.15236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14397v2","updated":"2023-08-29T11:46:44Z","published":"2023-08-28T08:24:25Z","title":"Ensemble of Anchor-Free Models for Robust Bangla Document Layout\n Segmentation","summary":" In this research paper, we introduce a novel approach designed for the\npurpose of segmenting the layout of Bangla documents. Our methodology involves\nthe utilization of a sophisticated ensemble of YOLOv8 models, which were\ntrained for the DL Sprint 2.0 - BUET CSE Fest 2023 Competition focused on\nBangla document layout segmentation. Our primary emphasis lies in enhancing\nvarious aspects of the task, including techniques such as image augmentation,\nmodel architecture, and the incorporation of model ensembles. We deliberately\nreduce the quality of a subset of document images to enhance the resilience of\nmodel training, thereby resulting in an improvement in our cross-validation\nscore. By employing Bayesian optimization, we determine the optimal confidence\nand Intersection over Union (IoU) thresholds for our model ensemble. Through\nour approach, we successfully demonstrate the effectiveness of anchor-free\nmodels in achieving robust layout segmentation in Bangla documents.\n","authors":["U Mong Sain Chak","Md. Asib Rahman"],"pdf_url":"https://arxiv.org/pdf/2308.14397v2.pdf","comment":"4 pages, 5 figures, 6 Tables"},{"id":"http://arxiv.org/abs/2308.15226v1","updated":"2023-08-29T11:29:43Z","published":"2023-08-29T11:29:43Z","title":"CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for\n Multimodal Machine Translation","summary":" There has been a growing interest in developing multimodal machine\ntranslation (MMT) systems that enhance neural machine translation (NMT) with\nvisual knowledge. This problem setup involves using images as auxiliary\ninformation during training, and more recently, eliminating their use during\ninference. Towards this end, previous works face a challenge in training\npowerful MMT models from scratch due to the scarcity of annotated multilingual\nvision-language data, especially for low-resource languages. Simultaneously,\nthere has been an influx of multilingual pre-trained models for NMT and\nmultimodal pre-trained models for vision-language tasks, primarily in English,\nwhich have shown exceptional generalisation ability. However, these are not\ndirectly applicable to MMT since they do not provide aligned multimodal\nmultilingual features for generative tasks. To alleviate this issue, instead of\ndesigning complex modules for MMT, we propose CLIPTrans, which simply adapts\nthe independently pre-trained multimodal M-CLIP and the multilingual mBART. 
In\norder to align their embedding spaces, mBART is conditioned on the M-CLIP\nfeatures by a prefix sequence generated through a lightweight mapping network.\nWe train this in a two-stage pipeline which warms up the model with image\ncaptioning before the actual translation task. Through experiments, we\ndemonstrate the merits of this framework and consequently push forward the\nstate-of-the-art across standard benchmarks by an average of +2.67 BLEU. The\ncode can be found at www.github.com/devaansh100/CLIPTrans.\n","authors":["Devaansh Gupta","Siddhant Kharbanda","Jiawei Zhou","Wanhua Li","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2308.15226v1.pdf","comment":"15 pages, 9 figures, to be published in Proceedings of the International\n Conference on Computer Vision (ICCV), 2023"},{"id":"http://arxiv.org/abs/2308.15216v1","updated":"2023-08-29T11:12:53Z","published":"2023-08-29T11:12:53Z","title":"Optron: Better Medical Image Registration via Training in the Loop","summary":" In the field of medical image registration, there have been primarily\ntwo paradigms: the traditional optimization-based methods and the\ndeep-learning-based methods. Each of these paradigms has its advantages, and in\nthis work, we aim to take the best of both worlds. Instead of developing a new\ndeep learning model, we designed a robust training architecture that is simple\nand generalizable. We present Optron, a general training architecture\nincorporating the idea of training-in-the-loop. By iteratively optimizing the\nprediction result of a deep learning model through a plug-and-play optimizer\nmodule in the training loop, Optron introduces pseudo ground truth to an\nunsupervised training process. By bringing the training process closer to\nthat of supervised training, Optron can consistently improve the models'\nperformance and convergence speed. We evaluated our method on various\ncombinations of models and datasets, and we have achieved state-of-the-art\nperformance on the IXI dataset, improving the previous state-of-the-art method\nTransMorph by a significant margin of +1.6% DSC. Moreover, Optron also\nconsistently achieved positive results with other models and datasets. It\nincreases the validation DSC for VoxelMorph and ViT-V-Net by +2.3% and +2.2%\nrespectively on IXI, demonstrating our method's generalizability. Our\nimplementation is publicly available at\nhttps://github.com/miraclefactory/optron\n","authors":["Yicheng Chen","Shengxiang Ji","Yuelin Xin","Kun Han","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15216v1.pdf","comment":"10 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.03038v2","updated":"2023-08-29T11:08:59Z","published":"2023-06-05T16:53:58Z","title":"HeadSculpt: Crafting 3D Head Avatars with Text","summary":" Recently, text-guided 3D generative methods have made remarkable advancements\nin producing high-quality textures and geometry, capitalizing on the\nproliferation of large vision-language and image diffusion models. However,\nexisting methods still struggle to create high-fidelity 3D head avatars in two\naspects: (1) They rely mostly on a pre-trained text-to-image diffusion model\nwhilst missing the necessary 3D awareness and head priors. This makes them\nprone to inconsistency and geometric distortions in the generated avatars. (2)\nThey fall short in fine-grained editing. This is primarily due to the inherited\nlimitations from the pre-trained 2D image diffusion models, which become more\npronounced when it comes to 3D head avatars.
In this work, we address these\nchallenges by introducing a versatile coarse-to-fine pipeline dubbed HeadSculpt\nfor crafting (i.e., generating and editing) 3D head avatars from textual\nprompts. Specifically, we first equip the diffusion model with 3D awareness by\nleveraging landmark-based control and a learned textual embedding representing\nthe back view appearance of heads, enabling 3D-consistent head avatar\ngenerations. We further propose a novel identity-aware editing score\ndistillation strategy to optimize a textured mesh with a high-resolution\ndifferentiable rendering technique. This enables identity preservation while\nfollowing the editing instruction. We showcase HeadSculpt's superior fidelity\nand editing capabilities through comprehensive experiments and comparisons with\nexisting methods.\n","authors":["Xiao Han","Yukang Cao","Kai Han","Xiatian Zhu","Jiankang Deng","Yi-Zhe Song","Tao Xiang","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2306.03038v2.pdf","comment":"Webpage: https://brandonhan.uk/HeadSculpt/"},{"id":"http://arxiv.org/abs/2211.14573v3","updated":"2023-08-29T10:59:41Z","published":"2022-11-26T14:00:18Z","title":"Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation\n for Pretrained Deep Generative Model","summary":" Semantic editing of images is the fundamental goal of computer vision.\nAlthough deep learning methods, such as generative adversarial networks (GANs),\nare capable of producing high-quality images, they often do not have an\ninherent way of editing generated images semantically. Recent studies have\ninvestigated a way of manipulating the latent variable to determine the images\nto be generated. However, methods that assume linear semantic arithmetic have\ncertain limitations in terms of the quality of image editing, whereas methods\nthat discover nonlinear semantic pathways provide non-commutative editing,\nwhich is inconsistent when applied in different orders. This study proposes a\nnovel method called deep curvilinear editing (DeCurvEd) to determine semantic\ncommuting vector fields on the latent space. We theoretically demonstrate that\nowing to commutativity, the editing of multiple attributes depends only on the\nquantities and not on the order. Furthermore, we experimentally demonstrate\nthat compared to previous methods, the nonlinear and commutative nature of\nDeCurvEd facilitates the disentanglement of image attributes and provides\nhigher-quality editing.\n","authors":["Takehiro Aoshima","Takashi Matsubara"],"pdf_url":"https://arxiv.org/pdf/2211.14573v3.pdf","comment":"15 pages. The last update made no changes except for adding the\n following link to the CVF repository:\n https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html.\n Here, you can find our code to reproduce our results"},{"id":"http://arxiv.org/abs/2308.10658v2","updated":"2023-08-29T10:37:26Z","published":"2023-08-21T11:51:46Z","title":"Learning Clothing and Pose Invariant 3D Shape Representation for\n Long-Term Person Re-Identification","summary":" Long-Term Person Re-Identification (LT-ReID) has become increasingly crucial\nin computer vision and biometrics. In this work, we aim to extend LT-ReID\nbeyond pedestrian recognition to include a wider range of real-world human\nactivities while still accounting for cloth-changing scenarios over large time\ngaps. 
This setting poses additional challenges due to the geometric\nmisalignment and appearance ambiguity caused by the diversity of human pose and\nclothing. To address these challenges, we propose a new approach 3DInvarReID\nfor (i) disentangling identity from non-identity components (pose, clothing\nshape, and texture) of 3D clothed humans, and (ii) reconstructing accurate 3D\nclothed body shapes and learning discriminative features of naked body shapes\nfor person ReID in a joint manner. To better evaluate our study of LT-ReID, we\ncollect a real-world dataset called CCDA, which contains a wide variety of\nhuman activities and clothing changes. Experimentally, we show the superior\nperformance of our approach for person ReID.\n","authors":["Feng Liu","Minchul Kim","ZiAng Gu","Anil Jain","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2308.10658v2.pdf","comment":"10 pages, 7 figures, accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2304.11705v2","updated":"2023-08-29T10:08:24Z","published":"2023-04-23T17:43:29Z","title":"Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR\n Semantic Segmentation","summary":" The ability to deploy robots that can operate safely in diverse environments\nis crucial for developing embodied intelligent agents. As a community, we have\nmade tremendous progress in within-domain LiDAR semantic segmentation. However,\ndo these methods generalize across domains? To answer this question, we design\nthe first experimental setup for studying domain generalization (DG) for LiDAR\nsemantic segmentation (DG-LSS). Our results confirm a significant gap between\nmethods, evaluated in a cross-domain setting: for example, a model trained on\nthe source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data,\ncompared to $48.49$ mIoU obtained by the model trained on the target domain\n(nuScenes). To tackle this gap, we propose the first method specifically\ndesigned for DG-LSS, which obtains $34.88$ mIoU on the target domain,\noutperforming all baselines. Our method augments a sparse-convolutional\nencoder-decoder 3D segmentation network with an additional, dense 2D\nconvolutional decoder that learns to classify a birds-eye view of the point\ncloud. This simple auxiliary task encourages the 3D network to learn features\nthat are robust to sensor placement shifts and resolution, and are transferable\nacross domains. With this work, we aim to inspire the community to develop and\nevaluate future models in such cross-domain conditions.\n","authors":["Cristiano Saltori","Aljoša Ošep","Elisa Ricci","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2304.11705v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15172v1","updated":"2023-08-29T09:54:30Z","published":"2023-08-29T09:54:30Z","title":"Is visual explanation with Grad-CAM more reliable for deeper neural\n networks? a case study with automatic pneumothorax diagnosis","summary":" While deep learning techniques have provided the state-of-the-art performance\nin various clinical tasks, explainability regarding their decision-making\nprocess can greatly enhance the credence of these methods for safer and quicker\nclinical adoption. With high flexibility, Gradient-weighted Class Activation\nMapping (Grad-CAM) has been widely adopted to offer intuitive visual\ninterpretation of various deep learning models' reasoning processes in\ncomputer-assisted diagnosis. 
However, despite the popularity of the technique,\nthere is still a lack of systematic study on Grad-CAM's performance on\ndifferent deep learning architectures. In this study, we investigate its\nrobustness and effectiveness across different popular deep learning models,\nwith a focus on the impact of the networks' depths and architecture types, by\nusing a case study of automatic pneumothorax diagnosis in X-ray scans. Our\nresults show that deeper neural networks do not necessarily contribute to a\nstrong improvement of pneumothorax diagnosis accuracy, and the effectiveness of\nGrad-CAM also varies among different network architectures.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.15172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15170v1","updated":"2023-08-29T09:53:10Z","published":"2023-08-29T09:53:10Z","title":"A lightweight 3D dense facial landmark estimation model from position\n map data","summary":" The incorporation of 3D data in facial analysis tasks has gained popularity\nin recent years. Though it provides a more accurate and detailed representation\nof the human face, acquiring 3D face data is more complex and expensive than capturing 2D\nface images. One either has to rely on expensive 3D scanners or on depth sensors,\nwhich are prone to noise. An alternative option is the reconstruction of 3D\nfaces from uncalibrated 2D images in an unsupervised way without any ground\ntruth 3D data. However, such approaches are computationally expensive and the\nlearned model size is not suitable for mobile or other edge device\napplications. Predicting dense 3D landmarks over the whole face can overcome\nthis issue. As there is no public dataset available containing dense landmarks,\nwe propose a pipeline to create a dense keypoint training dataset containing\n520 key points across the whole face from existing facial position map data.\nWe train a lightweight MobileNet-based regressor model with the generated data.\nAs we do not have access to any evaluation dataset with dense landmarks in it,\nwe evaluate our model against the 68-keypoint detection task. Experimental\nresults show that our trained model outperforms many of the existing methods despite\nits smaller model size and minimal computational cost. Also, the\nqualitative evaluation shows the effectiveness of our trained models under extreme\nhead pose angles as well as other facial variations and occlusions.\n","authors":["Shubhajit Basak","Sathish Mangapuram","Gabriel Costache","Rachel McDonnell","Michael Schukat"],"pdf_url":"https://arxiv.org/pdf/2308.15170v1.pdf","comment":"8 pages, The Irish Machine Vision and Image Processing\n Conference (IMVIP)"},{"id":"http://arxiv.org/abs/2308.15169v1","updated":"2023-08-29T09:52:32Z","published":"2023-08-29T09:52:32Z","title":"Uncovering the Unseen: Discover Hidden Intentions by Micro-Behavior\n Graph Reasoning","summary":" This paper introduces a new and challenging Hidden Intention Discovery (HID)\ntask. Unlike existing intention recognition tasks, which are based on obvious\nvisual representations to identify common intentions for normal behavior, HID\nfocuses on discovering hidden intentions when humans try to hide their\nintentions for abnormal behavior. HID presents a unique challenge in that\nhidden intentions lack the obvious visual representations to distinguish them\nfrom normal intentions.
Fortunately, from a sociological and psychological\nperspective, we find that the difference between hidden and normal intentions\ncan be inferred from multiple micro-behaviors, such as gaze, attention, and\nfacial expressions. Therefore, we first discover the relationship between\nmicro-behavior and hidden intentions and use a graph structure to reason about\nhidden intentions. To facilitate research in the field of HID, we also\nconstructed a seminal dataset containing hidden intention annotations of a\ntypical theft scenario for HID. Extensive experiments show that the proposed\nnetwork improves performance on the HID task by 9.9\\% over the state-of-the-art\nmethod SBP.\n","authors":["Zhuo Zhou","Wenxuan Liu","Danni Xu","Zheng Wang","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.15169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.07700v2","updated":"2023-08-29T09:49:34Z","published":"2022-12-15T10:23:32Z","title":"Colab NAS: Obtaining lightweight task-specific convolutional neural\n networks following Occam's razor","summary":" The current trend of applying transfer learning from convolutional neural\nnetworks (CNNs) trained on large datasets can be overkill when the target\napplication is a custom and delimited problem, with enough data to train a\nnetwork from scratch. On the other hand, the training of custom and lighter\nCNNs requires expertise, in the from-scratch case, and/or high-end resources,\nas in the case of hardware-aware neural architecture search (HW NAS), limiting\naccess to the technology for non-expert NN developers.\n For this reason, we present ColabNAS, an affordable HW NAS technique for\nproducing lightweight task-specific CNNs. Its novel derivative-free search\nstrategy, inspired by Occam's razor, makes it possible to obtain state-of-the-art results\non the Visual Wake Word dataset, a standard TinyML benchmark, in just 3.1 GPU\nhours using free online GPU services such as Google Colaboratory and Kaggle\nKernel.\n","authors":["Andrea Mattia Garavagno","Daniele Leonardis","Antonio Frisoli"],"pdf_url":"https://arxiv.org/pdf/2212.07700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14672v2","updated":"2023-08-29T09:33:59Z","published":"2023-03-26T10:15:33Z","title":"Sat2Density: Faithful Density Learning from Satellite-Ground Image Pairs","summary":" This paper aims to develop an accurate 3D geometry representation of\nsatellite images using satellite-ground image pairs. Our focus is on the\nchallenging problem of 3D-aware ground-view synthesis from a satellite image.\nWe draw inspiration from the density field representation used in volumetric\nneural rendering and propose a new approach, called Sat2Density. Our method\nutilizes the properties of ground-view panoramas for the sky and non-sky\nregions to learn faithful density fields of 3D scenes in a geometric\nperspective. Unlike other methods that require extra depth information during\ntraining, our Sat2Density can automatically learn accurate and faithful 3D\ngeometry via density representation without depth supervision. This advancement\nsignificantly improves the ground-view panorama synthesis task.
Additionally,\nour study provides a new geometric perspective to understand the relationship\nbetween satellite and ground-view images in 3D space.\n","authors":["Ming Qian","Jincheng Xiong","Gui-Song Xia","Nan Xue"],"pdf_url":"https://arxiv.org/pdf/2303.14672v2.pdf","comment":"ICCV 2023, project page: https://sat2density.github.io/, code:\n https://github.com/qianmingduowan/Sat2Density"},{"id":"http://arxiv.org/abs/2308.15142v1","updated":"2023-08-29T09:21:48Z","published":"2023-08-29T09:21:48Z","title":"A Multimodal Visual Encoding Model Aided by Introducing Verbal Semantic\n Information","summary":" Biological research has revealed that the verbal semantic information in the\nbrain cortex, as an additional source, participates in nonverbal semantic\ntasks, such as visual encoding. However, previous visual encoding models did\nnot incorporate verbal semantic information, contradicting this biological\nfinding. In response to this issue, this paper proposes a multimodal visual\ninformation encoding network model based on stimulus images and associated\ntextual information. Our visual information encoding network model takes stimulus\nimages as input and leverages textual information generated by a text-image\ngeneration model as verbal semantic information. This approach injects new\ninformation into the visual encoding model. Subsequently, a Transformer network\naligns image and text feature information, creating a multimodal feature space.\nA convolutional network then maps from this multimodal feature space to voxel\nspace, constructing the multimodal visual information encoding network model.\nExperimental results demonstrate that the proposed multimodal visual\ninformation encoding network model outperforms previous models at the same\ntraining cost. In voxel prediction of the left hemisphere of subject 1's brain,\nthe performance improves by approximately 15.87%, while in the right\nhemisphere, the performance improves by about 4.6%. The multimodal visual\nencoding network model exhibits superior encoding performance. Additionally,\nablation experiments indicate that our proposed model better simulates the\nbrain's visual information processing.\n","authors":["Shuxiao Ma","Linyuan Wang","Bin Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15141v1","updated":"2023-08-29T09:19:49Z","published":"2023-08-29T09:19:49Z","title":"Uncertainty Aware Training to Improve Deep Learning Model Calibration\n for Classification of Cardiac MR Images","summary":" Quantifying uncertainty of predictions has been identified as one way to\ndevelop more trustworthy artificial intelligence (AI) models beyond\nconventional reporting of performance metrics. When considering their role in a\nclinical decision support setting, AI classification models should ideally\navoid confident wrong predictions and maximise the confidence of correct\npredictions. Models that do this are said to be well-calibrated with regard to\nconfidence. However, relatively little attention has been paid to how to\nimprove calibration when training these models, i.e., to make the training\nstrategy uncertainty-aware. In this work we evaluate three novel\nuncertainty-aware training strategies, comparing them against two state-of-the-art\napproaches. We analyse performance on two different clinical applications:\ncardiac resynchronisation therapy (CRT) response prediction and coronary artery\ndisease (CAD) diagnosis from cardiac magnetic resonance (CMR) images.
The\nbest-performing model in terms of both classification accuracy and the most\ncommon calibration measure, expected calibration error (ECE), was the Confidence\nWeight method, a novel approach that weights the loss of samples to explicitly\npenalise confident incorrect predictions. The method reduced the ECE by 17% for\nCRT response prediction and by 22% for CAD diagnosis when compared to a\nbaseline classifier in which no uncertainty-aware strategy was included. In\nboth applications, as well as reducing the ECE, there was a slight increase in\naccuracy, from 69% to 70% and from 70% to 72% for CRT response prediction and CAD\ndiagnosis, respectively. However, our analysis showed a lack of consistency in\nterms of optimal models when using different calibration measures. This\nindicates the need for careful consideration of performance metrics when\ntraining and selecting models for complex high-risk applications in healthcare.\n","authors":["Tareen Dawood","Chen Chen","Baldeep S. Sidhu","Bram Ruijsink","Justin Gould","Bradley Porter","Mark K. Elliott","Vishal Mehta","Christopher A. Rinaldi","Esther Puyol-Anton","Reza Razavi","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2308.15141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14619v2","updated":"2023-08-29T09:16:48Z","published":"2023-08-28T14:43:36Z","title":"Compositional Semantic Mix for Domain Adaptation in Point Cloud\n Segmentation","summary":" Deep-learning models for 3D point cloud semantic segmentation exhibit limited\ngeneralization capabilities when trained and tested on data captured with\ndifferent sensors or in varying environments due to domain shift. Domain\nadaptation methods can be employed to mitigate this domain shift, for instance,\nby simulating sensor noise, developing domain-agnostic generators, or training\npoint cloud completion networks. Often, these methods are tailored for range\nview maps or necessitate multi-modal input. In contrast, domain adaptation in\nthe image domain can be executed through sample mixing, which emphasizes input\ndata manipulation rather than employing distinct adaptation modules. In this\nstudy, we introduce compositional semantic mixing for point cloud domain\nadaptation, representing the first unsupervised domain adaptation technique for\npoint cloud segmentation based on semantic and geometric sample mixing. We\npresent a two-branch symmetric network architecture capable of concurrently\nprocessing point clouds from a source domain (e.g. synthetic) and point clouds\nfrom a target domain (e.g. real-world). Each branch operates within one domain\nby integrating selected data fragments from the other domain and utilizing\nsemantic information derived from source labels and target (pseudo) labels.\nAdditionally, our method can leverage a limited number of human point-level\nannotations (semi-supervised) to further enhance performance. We assess our\napproach in both synthetic-to-real and real-to-real scenarios using LiDAR\ndatasets and demonstrate that it significantly outperforms state-of-the-art\nmethods in both unsupervised and semi-supervised settings.\n","authors":["Cristiano Saltori","Fabio Galasso","Giuseppe Fiameni","Nicu Sebe","Fabio Poiesi","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2308.14619v2.pdf","comment":"TPAMI.
arXiv admin note: text overlap with arXiv:2207.09778"},{"id":"http://arxiv.org/abs/2308.15137v1","updated":"2023-08-29T09:13:24Z","published":"2023-08-29T09:13:24Z","title":"Abdominal Multi-Organ Segmentation Based on Feature Pyramid Network and\n Spatial Recurrent Neural Network","summary":" As recent advances in AI are causing the decline of conventional diagnostic\nmethods, the realization of end-to-end diagnosis is fast approaching.\nUltrasound image segmentation is an important step in the diagnostic process.\nAn accurate and robust segmentation model accelerates the process and reduces\nthe burden of sonographers. In contrast to previous research, we take two\ninherent features of ultrasound images into consideration: (1) different organs\nand tissues vary in spatial size, and (2) the anatomical structures inside the human\nbody form relatively constant spatial relationships. Based on those two ideas,\nwe propose a new image segmentation model combining Feature Pyramid Network\n(FPN) and Spatial Recurrent Neural Network (SRNN). We discuss why we use FPN to\nextract anatomical structures of different scales and how SRNN is implemented\nto extract the spatial context features in abdominal ultrasound images.\n","authors":["Yuhan Song","Armagan Elibol","Nak Young Chong"],"pdf_url":"https://arxiv.org/pdf/2308.15137v1.pdf","comment":"IFAC World Congress 2023 paper"},{"id":"http://arxiv.org/abs/2308.15136v1","updated":"2023-08-29T09:10:53Z","published":"2023-08-29T09:10:53Z","title":"CAGRA: Highly Parallel Graph Construction and Approximate Nearest\n Neighbor Search for GPUs","summary":" Approximate Nearest Neighbor Search (ANNS) plays a critical role in various\ndisciplines spanning data mining and artificial intelligence, from information\nretrieval and computer vision to natural language processing and recommender\nsystems. Data volumes have soared in recent years and the computational cost of\nan exhaustive exact nearest neighbor search is often prohibitive, necessitating\nthe adoption of approximate techniques. The balanced performance and recall of\ngraph-based approaches have more recently garnered significant attention in\nANNS algorithms; however, only a few studies have explored harnessing the power\nof GPUs and multi-core processors despite the widespread use of massively\nparallel and general-purpose computing. To bridge this gap, we introduce a\nnovel parallel computing hardware-based proximity graph and search algorithm.\nBy leveraging the high-performance capabilities of modern hardware, our\napproach achieves remarkable efficiency gains. In particular, our method\nsurpasses existing CPU and GPU-based methods in constructing the proximity\ngraph, demonstrating higher throughput in both large- and small-batch searches\nwhile maintaining comparable accuracy. In graph construction time, our method,\nCAGRA, is 2.2~27x faster than HNSW, which is one of the CPU SOTA\nimplementations. In large-batch query throughput in the 90% to 95% recall\nrange, our method is 33~77x faster than HNSW, and is 3.8~8.8x faster than the\nSOTA implementations for GPU.
For a single query, our method is 3.4~53x faster\nthan HNSW at 95% recall.\n","authors":["Hiroyuki Ootomo","Akira Naruse","Corey Nolet","Ray Wang","Tamas Feher","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.08015v2","updated":"2023-08-29T09:05:58Z","published":"2022-08-17T01:44:32Z","title":"Cross-Domain Few-Shot Classification via Inter-Source Stylization","summary":" The goal of Cross-Domain Few-Shot Classification (CDFSC) is to accurately\nclassify a target dataset with limited labelled data by exploiting the\nknowledge of a richly labelled auxiliary dataset, despite the differences\nbetween the domains of the two datasets. Some existing approaches require\nlabelled samples from multiple domains for model training. However, these\nmethods fail when the sample labels are scarce. To overcome this challenge,\nthis paper proposes a solution that makes use of multiple source domains\nwithout the need for additional labeling costs. Specifically, one of the source\ndomains is completely tagged, while the others are untagged. An Inter-Source\nStylization Network (ISSNet) is then introduced to enhance stylisation across\nmultiple source domains, enriching the data distribution and the model's generalization\ncapabilities. Experiments on 8 target datasets show that ISSNet leverages\nunlabelled data from multiple source domains and significantly reduces the\nnegative impact of domain gaps on classification performance compared to\nseveral baseline methods.\n","authors":["Huali Xu","Shuaifeng Zhi","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2208.08015v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.13495v2","updated":"2023-08-29T09:01:13Z","published":"2023-08-25T17:10:22Z","title":"Open Gaze: Open Source eye tracker for smartphone devices using Deep\n Learning","summary":" Eye tracking has been a pivotal tool in diverse fields such as vision\nresearch, language analysis, and usability assessment. The majority of prior\ninvestigations, however, have concentrated on expansive desktop displays\nemploying specialized, costly eye tracking hardware that lacks scalability.\nRemarkably little insight exists into ocular movement patterns on smartphones,\ndespite their widespread adoption and significant usage. In this manuscript, we\npresent an open-source implementation of a smartphone-based gaze tracker that\nemulates the methodology proposed by a Google paper (whose source code remains\nproprietary). Our focus is on attaining accuracy comparable to that attained\nthrough the Google paper's methodology, without the necessity for supplementary\nhardware. Through the integration of machine learning techniques, we unveil an\naccurate eye tracking solution that is native to smartphones. Our approach\ndemonstrates precision akin to the state-of-the-art mobile eye trackers, which\nare characterized by a cost that is two orders of magnitude higher. Leveraging\nthe vast MIT GazeCapture dataset, which is available through registration on\nthe dataset's website, we successfully replicate crucial findings from previous\nstudies concerning ocular motion behavior in oculomotor tasks and saliency\nanalyses during natural image observation. Furthermore, we emphasize the\napplicability of smartphone-based gaze tracking in discerning reading\ncomprehension challenges. Our findings exhibit the inherent potential to\namplify eye movement research by significant proportions, accommodating\nparticipation from thousands of subjects with explicit consent.
This\nscalability not only fosters advancements in vision research, but also extends\nits benefits to domains such as accessibility enhancement and healthcare\napplications.\n","authors":["Sushmanth Reddy","Jyothi Swaroop Reddy"],"pdf_url":"https://arxiv.org/pdf/2308.13495v2.pdf","comment":"26 pages, 15 figures"},{"id":"http://arxiv.org/abs/2308.08730v3","updated":"2023-08-29T08:52:58Z","published":"2023-08-17T01:59:59Z","title":"Learning A Coarse-to-Fine Diffusion Transformer for Image Restoration","summary":" Recent years have witnessed the remarkable performance of diffusion models in\nvarious vision tasks. However, for image restoration that aims to recover clear\nimages with sharper details from given degraded observations, diffusion-based\nmethods may fail to recover promising results due to inaccurate noise\nestimation. Moreover, simply constraining the noise cannot effectively capture\ncomplex degradation information, which subsequently hinders the model capacity.\nTo solve the above problems, we propose a coarse-to-fine diffusion Transformer\n(C2F-DFT) for image restoration. Specifically, our C2F-DFT contains diffusion\nself-attention (DFSA) and diffusion feed-forward network (DFN) within a new\ncoarse-to-fine training scheme. The DFSA and DFN respectively capture the\nlong-range diffusion dependencies and learn hierarchical diffusion representations\nto facilitate better restoration. In the coarse training stage, our C2F-DFT\nestimates the noise and then generates the final clean image by a sampling\nalgorithm. To further improve the restoration quality, we propose a simple yet\neffective fine training scheme. It first exploits the coarse-trained diffusion\nmodel with fixed steps to generate restoration results, which are then\nconstrained with corresponding ground-truth ones to optimize the models to\nremedy the unsatisfactory results affected by inaccurate noise estimation.\nExtensive experiments show that C2F-DFT significantly outperforms\nthe diffusion-based restoration method IR-SDE and achieves competitive performance\ncompared with Transformer-based state-of-the-art methods on $3$ tasks,\nincluding deraining, deblurring, and real denoising. The code is available at\nhttps://github.com/wlydlut/C2F-DFT.\n","authors":["Liyan Wang","Qinyu Yang","Cong Wang","Wei Wang","Jinshan Pan","Zhixun Su"],"pdf_url":"https://arxiv.org/pdf/2308.08730v3.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.15126v1","updated":"2023-08-29T08:51:24Z","published":"2023-08-29T08:51:24Z","title":"Evaluation and Analysis of Hallucination in Large Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) have recently achieved remarkable\nsuccess. However, LVLMs are still plagued by the hallucination problem, which\nlimits their practicality in many scenarios. Hallucination refers to\ninformation in LVLMs' responses that does not exist in the visual input, which\nposes potential risks of substantial consequences. There has been limited work\nstudying hallucination evaluation in LVLMs. In this paper, we propose\nHallucination Evaluation based on Large Language Models (HaELM), an LLM-based\nhallucination evaluation framework. HaELM achieves approximately 95% of\nChatGPT's performance and has additional advantages including low\ncost, reproducibility, privacy preservation and local deployment. Leveraging\nHaELM, we evaluate the hallucination in current LVLMs.
Furthermore, we\nanalyze the factors contributing to hallucination in LVLMs and offer helpful\nsuggestions to mitigate the hallucination problem. Our training data and human\nannotation hallucination data will be made public soon.\n","authors":["Junyang Wang","Yiyang Zhou","Guohai Xu","Pengcheng Shi","Chenlin Zhao","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Jihua Zhu","Jitao Sang","Haoyu Tang"],"pdf_url":"https://arxiv.org/pdf/2308.15126v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.15119v1","updated":"2023-08-29T08:37:16Z","published":"2023-08-29T08:37:16Z","title":"AI-Based Facial Emotion Recognition Solutions for Education: A Study of\n Teacher-User and Other Categories","summary":" Existing information on AI-based facial emotion recognition (FER) is not\neasily comprehensible by those outside the field of computer science, requiring\ncross-disciplinary effort to determine a categorisation framework that promotes\nthe understanding of this technology, and its impact on users. Most proponents\nclassify FER in terms of methodology, implementation and analysis; relatively\nfew by its application in education; and none by its users. This paper is\nconcerned primarily with (potential) teacher-users of FER tools for education.\nIt proposes a three-part classification of these teachers, by orientation,\ncondition and preference, based on a classical taxonomy of affective\neducational objectives, and related theories. It also compiles and organises\nthe types of FER solutions found in or inferred from the literature into\n\"technology\" and \"applications\" categories, as a prerequisite for structuring\nthe proposed \"teacher-user\" category. This work has implications for\nproponents', critics', and users' understanding of the relationship between\nteachers and FER.\n","authors":["R. Yamamoto Ravenor"],"pdf_url":"https://arxiv.org/pdf/2308.15119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13004v2","updated":"2023-08-29T08:35:07Z","published":"2022-10-24T07:50:02Z","title":"Efficient Representation of Natural Image Patches","summary":" In the complex domain of neural information processing, discerning\nfundamental principles from ancillary details remains a significant challenge.\nWhile there is extensive knowledge about the anatomy and physiology of the\nearly visual system, a comprehensive computational theory remains elusive. Can\nwe gain insights into the underlying principles of a biological system by\nabstracting away from its detailed implementation and focusing on the\nfundamental problems that the system is designed to solve? Utilizing an\nabstract model based on minimal yet realistic assumptions, we show how to\nachieve the early visual system's two ultimate objectives: efficient\ninformation transmission and sensor probability distribution modeling. We show\nthat optimizing for information transmission does not yield optimal probability\ndistribution modeling. We illustrate, using a two-pixel (2D) system and image\npatches, that an efficient representation can be realized via nonlinear\npopulation code driven by two types of biologically plausible loss functions\nthat depend solely on output. After unsupervised learning, our abstract IPU\nmodel bears remarkable resemblances to biological systems, despite not\nmimicking many features of real neurons, such as spiking activity. A\npreliminary comparison with a contemporary deep learning model suggests that\nthe IPU model offers a significant efficiency advantage. 
Our model provides\nnovel insights into the computational theory of early visual systems as well as\na potential new approach to enhance the efficiency of deep learning models.\n","authors":["Cheng Guo"],"pdf_url":"https://arxiv.org/pdf/2210.13004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00647v3","updated":"2023-08-29T08:34:40Z","published":"2022-10-02T22:45:11Z","title":"IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable\n Novel View Synthesis","summary":" Existing methods that combine inverse rendering with neural rendering can only\nperform editable novel view synthesis on object-specific scenes, while we\npresent intrinsic neural radiance fields, dubbed IntrinsicNeRF, which introduce\nintrinsic decomposition into the NeRF-based neural rendering method and can\nextend its application to room-scale scenes. Since intrinsic decomposition is a\nfundamentally under-constrained inverse problem, we propose a novel\ndistance-aware point sampling and adaptive reflectance iterative clustering\noptimization method, which enables IntrinsicNeRF with traditional intrinsic\ndecomposition constraints to be trained in an unsupervised manner, resulting in\nmulti-view consistent intrinsic decomposition results. To cope with the problem\nthat different adjacent instances of similar reflectance in a scene are\nincorrectly clustered together, we further propose a hierarchical clustering\nmethod with coarse-to-fine optimization to obtain a fast hierarchical indexing\nrepresentation. It supports compelling real-time augmented applications such as\nrecoloring and illumination variation. Extensive experiments and editing\nsamples on both object-specific/room-scale scenes and synthetic/real-world data\ndemonstrate that we can obtain consistent intrinsic decomposition results and\nhigh-fidelity novel view synthesis even for challenging sequences.\n","authors":["Weicai Ye","Shuo Chen","Chong Bao","Hujun Bao","Marc Pollefeys","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.00647v3.pdf","comment":"Accepted to ICCV 2023, Project webpage:\n https://zju3dv.github.io/intrinsic_nerf/, code:\n https://github.com/zju3dv/IntrinsicNeRF"},{"id":"http://arxiv.org/abs/2308.15109v1","updated":"2023-08-29T08:20:23Z","published":"2023-08-29T08:20:23Z","title":"DiffusionVMR: Diffusion Model for Video Moment Retrieval","summary":" Video moment retrieval is a fundamental visual-language task that aims to\nretrieve target moments from an untrimmed video based on a language query.\nExisting methods typically generate numerous proposals manually or via\ngenerative networks in advance as the support set for retrieval, which is not\nonly inflexible but also time-consuming. Inspired by the success of diffusion\nmodels on object detection, this work aims at reformulating video moment\nretrieval as a denoising generation process to get rid of the inflexible and\ntime-consuming proposal generation. To this end, we propose a novel\nproposal-free framework, namely DiffusionVMR, which directly samples random\nspans from noise as candidates and introduces denoising learning to ground\ntarget moments. During training, Gaussian noise is added to the real moments,\nand the model is trained to learn how to reverse this process. In inference, a\nset of time spans is progressively refined from the initial noise to the final\noutput.
Notably, the training and inference of DiffusionVMR are decoupled, and\nan arbitrary number of random spans can be used in inference without being\nconsistent with the training phase. Extensive experiments conducted on three\nwidely-used benchmarks (i.e., QVHighlight, Charades-STA, and TACoS) demonstrate\nthe effectiveness of the proposed DiffusionVMR by comparing it with\nstate-of-the-art methods.\n","authors":["Henghao Zhao","Kevin Qinghong Lin","Rui Yan","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2308.15109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15094v1","updated":"2023-08-29T08:02:41Z","published":"2023-08-29T08:02:41Z","title":"Group-Conditional Conformal Prediction via Quantile Regression\n Calibration for Crop and Weed Classification","summary":" As deep learning predictive models become an integral part of a large\nspectrum of precision agricultural systems, a barrier to the adoption of such\nautomated solutions is the lack of user trust in these highly complex, opaque\nand uncertain models. Indeed, deep neural networks are not equipped with any\nexplicit guarantees that can be used to certify the system's performance,\nespecially in highly varying uncontrolled environments such as the ones\ntypically faced in computer vision for agriculture. Fortunately, certain methods\ndeveloped in other communities can prove to be important for agricultural\napplications. This article presents the conformal prediction framework, which\nprovides valid statistical guarantees on the predictive performance of any\nblack box prediction machine, with almost no assumptions, applied to the\nproblem of deep visual classification of weeds and crops in real-world\nconditions. The framework is presented with a focus on its practical aspects, with\nspecial attention given to the Adaptive Prediction Sets (APS) approach that\ndelivers marginal guarantees on the model's coverage. Marginal results are then\nshown to be insufficient to guarantee performance on all groups of individuals\nin the population as characterized by their environmental and pedo-climatic\nauxiliary data gathered during image acquisition. To tackle this shortcoming,\ngroup-conditional conformal approaches are presented: the \"classical\" method\nthat consists of iteratively applying the APS procedure to all groups, and a\nproposed elegant reformulation and implementation of the procedure using\nquantile regression on group membership indicators. Empirical results showing\nthe validity of the proposed approach are presented, compared to the\nmarginal APS, and then discussed.\n","authors":["Paul Melki","Lionel Bombrun","Boubacar Diallo","Jérôme Dias","Jean-Pierre da Costa"],"pdf_url":"https://arxiv.org/pdf/2308.15094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14308v3","updated":"2023-08-29T07:58:49Z","published":"2022-11-25T18:59:46Z","title":"WALDO: Future Video Synthesis using Object Layer Decomposition and\n Parametric Flow Prediction","summary":" This paper presents WALDO (WArping Layer-Decomposed Objects), a novel\napproach to the prediction of future video frames from past ones. Individual\nimages are decomposed into multiple layers combining object masks and a small\nset of control points. The layer structure is shared across all frames in each\nvideo to build dense inter-frame connections.
Complex scene motions are modeled\nby combining parametric geometric transformations associated with individual\nlayers, and video synthesis is broken down into discovering the layers\nassociated with past frames, predicting the corresponding transformations for\nupcoming ones and warping the associated object regions accordingly, and\nfilling in the remaining image parts. Extensive experiments on multiple\nbenchmarks including urban videos (Cityscapes and KITTI) and videos featuring\nnonrigid motions (UCF-Sports and H3.6M), show that our method consistently\noutperforms the state of the art by a significant margin in every case. Code,\npretrained models, and video samples synthesized by our approach can be found\non the project webpage https://16lemoing.github.io/waldo.\n","authors":["Guillaume Le Moing","Jean Ponce","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2211.14308v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2303.11851v2","updated":"2023-08-29T07:57:20Z","published":"2023-03-21T13:49:49Z","title":"Sample4Geo: Hard Negative Sampling For Cross-View Geo-Localisation","summary":" Cross-View Geo-Localisation is still a challenging task where additional\nmodules, specific pre-processing or zooming strategies are necessary to\ndetermine accurate positions of images. Since different views have different\ngeometries, pre-processing like polar transformation helps to merge them.\nHowever, this results in distorted images which then have to be rectified.\nAdding hard negatives to the training batch could improve the overall\nperformance but with the default loss functions in geo-localisation it is\ndifficult to include them. In this article, we present a simplified but\neffective architecture based on contrastive learning with symmetric InfoNCE\nloss that outperforms current state-of-the-art results. Our framework consists\nof a narrow training pipeline that eliminates the need for aggregation\nmodules, avoids further pre-processing steps and even increases the\ngeneralisation capability of the model to unknown regions. We introduce two\ntypes of sampling strategies for hard negatives. The first explicitly exploits\ngeographically neighboring locations to provide a good starting point. The\nsecond leverages the visual similarity between the image embeddings in order to\nmine hard negative samples. Our work shows excellent performance on common\ncross-view datasets like CVUSA, CVACT, University-1652 and VIGOR. A comparison\nbetween cross-area and same-area settings demonstrates the good generalisation\ncapability of our model.\n","authors":["Fabian Deuser","Konrad Habel","Norbert Oswald"],"pdf_url":"https://arxiv.org/pdf/2303.11851v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15088v1","updated":"2023-08-29T07:51:36Z","published":"2023-08-29T07:51:36Z","title":"Using deep learning for an automatic detection and classification of the\n vascular bifurcations along the Circle of Willis","summary":" Most of the intracranial aneurysms (ICA) occur on a specific portion of the\ncerebral vascular tree named the Circle of Willis (CoW). More particularly,\nthey mainly arise onto fifteen of the major arterial bifurcations constituting\nthis circular structure. Hence, for an efficient and timely diagnosis it is\ncritical to develop methods able to accurately recognize each\nBifurcation of Interest (BoI).
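The symmetric InfoNCE objective at the heart of Sample4Geo is compact enough to write out. A minimal sketch, assuming paired satellite/ground-view embeddings (the names are ours); note that both hard-negative strategies the abstract describes reduce to choosing what else goes into the batch:

```python
import torch
import torch.nn.functional as F

def symmetric_infonce(sat_emb, ground_emb, tau=0.07):
    # Matching cross-view pairs sit on the diagonal of the similarity matrix;
    # every other item in the batch acts as a negative, so hard negatives are
    # injected simply by batch composition (geographic neighbors or visually
    # similar embeddings).
    sat = F.normalize(sat_emb, dim=1)
    ground = F.normalize(ground_emb, dim=1)
    logits = sat @ ground.t() / tau
    targets = torch.arange(sat.size(0), device=sat.device)
    # Symmetric: classify ground views given satellite views and vice versa.
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

loss = symmetric_infonce(torch.randn(8, 128), torch.randn(8, 128))
```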
Indeed, an automatic extraction of the\nbifurcations presenting the highest risk of developing an ICA would offer the\nneuroradiologists a quick glance at the most alarming areas. Due to recent\nefforts in Artificial Intelligence, Deep Learning turned out to be the best\nperforming technology for many pattern recognition tasks. Moreover, various\nmethods have been particularly designed for medical image analysis purposes.\nThis study intends to assist the neuroradiologists in promptly locating any\nbifurcation presenting a high risk of ICA occurrence. It can be seen as a\nComputer Aided Diagnosis scheme, where the Artificial Intelligence facilitates\nthe access to the regions of interest within the MRI. In this work, we propose\na method for a fully automatic detection and recognition of the bifurcations of\ninterest forming the Circle of Willis. Several neural network architectures\nhave been tested, and we thoroughly evaluate the bifurcation recognition rate.\n","authors":["Rafic Nader","Romain Bourcier","Florent Autrusseau"],"pdf_url":"https://arxiv.org/pdf/2308.15088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15085v1","updated":"2023-08-29T07:50:11Z","published":"2023-08-29T07:50:11Z","title":"Learning to Upsample by Learning to Sample","summary":" We present DySample, an ultra-lightweight and effective dynamic upsampler.\nWhile impressive performance gains have been witnessed from recent kernel-based\ndynamic upsamplers such as CARAFE, FADE, and SAPA, they introduce a heavy\nworkload, mostly due to the time-consuming dynamic convolution and the\nadditional sub-network used to generate dynamic kernels. Further, the need of\nFADE and SAPA for high-res feature guidance limits their application\nscenarios. To address these concerns, we bypass dynamic convolution and\nformulate upsampling from the perspective of point sampling, which is more\nresource-efficient and can be easily implemented with the standard built-in\nfunction in PyTorch. We first showcase a naive design, and then demonstrate how\nto strengthen its upsampling behavior step by step towards our new upsampler,\nDySample. Compared with former kernel-based dynamic upsamplers, DySample\nrequires no customized CUDA package and has far fewer parameters, lower FLOPs,\nless GPU memory usage, and lower latency. Besides its light-weight characteristics, DySample\noutperforms other upsamplers across five dense prediction tasks, including\nsemantic segmentation, object detection, instance segmentation, panoptic\nsegmentation, and monocular depth estimation. Code is available at\nhttps://github.com/tiny-smart/dysample.\n","authors":["Wenze Liu","Hao Lu","Hongtao Fu","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2308.15085v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15081v1","updated":"2023-08-29T07:29:30Z","published":"2023-08-29T07:29:30Z","title":"Class Prior-Free Positive-Unlabeled Learning with Taylor Variational\n Loss for Hyperspectral Remote Sensing Imagery","summary":" Positive-unlabeled learning (PU learning) in hyperspectral remote sensing\nimagery (HSI) is aimed at learning a binary classifier from positive and\nunlabeled data, which has broad prospects in various earth vision applications.\nHowever, when PU learning meets limited labeled HSI, the unlabeled data may\ndominate the optimization process, which makes the neural networks overfit the\nunlabeled data.
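The "standard built-in function in PyTorch" that makes point-sampling upsampling cheap is torch.nn.functional.grid_sample. A naive sketch in that spirit (the offset head, the 0.1 scaling, and the tanh squashing are guesses of ours, not DySample's actual design):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class NaivePointUpsampler(nn.Module):
    # Upsample by sampling the input at learned, content-dependent positions
    # instead of generating dynamic convolution kernels.
    def __init__(self, channels, scale=2):
        super().__init__()
        self.scale = scale
        self.offset = nn.Conv2d(channels, 2 * scale * scale, 1)  # per-point xy offsets

    def forward(self, x):
        b, c, h, w = x.shape
        s = self.scale
        # Predict offsets and rearrange them onto the (s*h, s*w) output grid.
        off = self.offset(x).view(b, 2, s, s, h, w)
        off = off.permute(0, 1, 4, 2, 5, 3).reshape(b, 2, h * s, w * s)
        # Base sampling grid in [-1, 1], as expected by grid_sample.
        ys = torch.linspace(-1, 1, h * s, device=x.device)
        xs = torch.linspace(-1, 1, w * s, device=x.device)
        gy, gx = torch.meshgrid(ys, xs, indexing="ij")
        grid = torch.stack([gx, gy], dim=-1).unsqueeze(0).expand(b, -1, -1, -1)
        grid = grid + 0.1 * off.permute(0, 2, 3, 1).tanh()  # small learned shifts
        return F.grid_sample(x, grid, mode="bilinear", align_corners=True)

up = NaivePointUpsampler(16)
y = up(torch.randn(1, 16, 8, 8))  # -> (1, 16, 16, 16)
```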
In this paper, a Taylor variational loss is proposed for HSI PU\nlearning, which reduces the weight of the gradient of the unlabeled data by\nTaylor series expansion to enable the network to find a balance between\noverfitting and underfitting. In addition, a self-calibrated optimization\nstrategy is designed to stabilize the training process. Experiments on 7\nbenchmark datasets (21 tasks in total) validate the effectiveness of the\nproposed method. Code is at: https://github.com/Hengwei-Zhao96/T-HOneCls.\n","authors":["Hengwei Zhao","Xinyu Wang","Jingtao Li","Yanfei Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.15081v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15074v1","updated":"2023-08-29T07:15:57Z","published":"2023-08-29T07:15:57Z","title":"Exploring Model Transferability through the Lens of Potential Energy","summary":" Transfer learning has become crucial in computer vision tasks due to the vast\navailability of pre-trained deep learning models. However, selecting the\noptimal pre-trained model from a diverse pool for a specific downstream task\nremains a challenge. Existing methods for measuring the transferability of\npre-trained models rely on statistical correlations between encoded static\nfeatures and task labels, but they overlook the impact of underlying\nrepresentation dynamics during fine-tuning, leading to unreliable results,\nespecially for self-supervised models. In this paper, we present an insightful\nphysics-inspired approach named PED to address these challenges. We reframe the\nchallenge of model selection through the lens of potential energy and directly\nmodel the interaction forces that influence fine-tuning dynamics. By capturing\nthe motion of dynamic representations to reduce the potential energy within a\nforce-driven physical model, we can acquire an enhanced and more stable\nobservation for estimating transferability. The experimental results on 10\ndownstream tasks and 12 self-supervised models demonstrate that our approach\ncan seamlessly integrate into existing ranking techniques and enhance their\nperformance, revealing its effectiveness for the model selection task and its\npotential for understanding the mechanism in transfer learning. Code will be\navailable at https://github.com/lixiaotong97/PED.\n","authors":["Xiaotong Li","Zixuan Hu","Yixiao Ge","Ying Shan","Ling-Yu Duan"],"pdf_url":"https://arxiv.org/pdf/2308.15074v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15070v1","updated":"2023-08-29T07:11:52Z","published":"2023-08-29T07:11:52Z","title":"DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior","summary":" We present DiffBIR, which leverages pretrained text-to-image diffusion models\nfor the blind image restoration problem. Our framework adopts a two-stage pipeline.\nIn the first stage, we pretrain a restoration module across diversified\ndegradations to improve generalization capability in real-world scenarios. The\nsecond stage leverages the generative ability of latent diffusion models, to\nachieve realistic image restoration. Specifically, we introduce an injective\nmodulation sub-network -- LAControlNet for finetuning, while the pre-trained\nStable Diffusion is kept unchanged to maintain its generative ability. Finally, we introduce a\ncontrollable module that allows users to balance quality and fidelity by\nintroducing the latent image guidance in the denoising process during\ninference.
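The abstract does not spell out the Taylor variational loss, but the mechanism it names has a simple canonical form: truncating the Taylor series of -log(p) around p = 1 bounds the per-sample gradient, so low-confidence (unlabeled) samples cannot dominate the update. A sketch of that idea as we read it, not necessarily the paper's exact loss:

```python
import torch

def taylor_ce(probs, K=2, eps=1e-7):
    # probs: predicted probability of the (pseudo-)positive class per sample.
    # Truncated Taylor series of -log(p) around p = 1:
    #   -log(p) = sum_{k>=1} (1 - p)^k / k.
    # Keeping only K terms bounds |d loss / d p| by K, whereas the full
    # cross-entropy gradient -1/p blows up for low-confidence samples.
    p = probs.clamp(eps, 1.0)
    one_minus = 1.0 - p
    return sum(one_minus.pow(k) / k for k in range(1, K + 1)).mean()

p = torch.tensor([0.9, 0.5, 0.05], requires_grad=True)
taylor_ce(p).backward()
print(p.grad)  # bounded even for the 0.05 entry, unlike -1/p
```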
Extensive experiments have demonstrated its superiority over\nstate-of-the-art approaches for both blind image super-resolution and blind\nface restoration tasks on synthetic and real-world datasets. The code is\navailable at https://github.com/XPixelGroup/DiffBIR.\n","authors":["Xinqi Lin","Jingwen He","Ziyan Chen","Zhaoyang Lyu","Ben Fei","Bo Dai","Wanli Ouyang","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15068v1","updated":"2023-08-29T07:00:35Z","published":"2023-08-29T07:00:35Z","title":"A Comprehensive Augmentation Framework for Anomaly Detection","summary":" Data augmentation methods are commonly integrated into the training of\nanomaly detection models. Previous approaches have primarily focused on\nreplicating real-world anomalies or enhancing diversity, without considering\nthat the standard of anomaly varies across different classes, potentially\nleading to a biased training distribution. This paper analyzes crucial traits of\nsimulated anomalies that contribute to the training of reconstructive networks\nand condenses them into several methods, thus creating a comprehensive\nframework by selectively utilizing appropriate combinations. Furthermore, we\nintegrate this framework with a reconstruction-based approach and concurrently\npropose a split training strategy that alleviates the issue of overfitting\nwhile avoiding introducing interference to the reconstruction process. The\nevaluations conducted on the MVTec anomaly detection dataset demonstrate that\nour method outperforms the previous state-of-the-art approach, particularly in\nterms of object classes. To evaluate generalizability, we generate a simulated\ndataset comprising anomalies with diverse characteristics since the original\ntest samples only include specific types of anomalies and may lead to biased\nevaluations. Experimental results demonstrate that our approach exhibits\npromising potential for generalizing effectively to various unforeseen\nanomalies encountered in real-world scenarios.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15063v1","updated":"2023-08-29T06:55:42Z","published":"2023-08-29T06:55:42Z","title":"Learning Cross-modality Information Bottleneck Representation for\n Heterogeneous Person Re-Identification","summary":" Visible-Infrared person re-identification (VI-ReID) is an important and\nchallenging task in intelligent video surveillance. Existing methods mainly\nfocus on learning a shared feature space to reduce the modality discrepancy\nbetween visible and infrared modalities, which still leave two problems\nunderexplored: information redundancy and modality complementarity. To this\nend, properly eliminating the identity-irrelevant information as well as making\nup for the modality-specific information is critical and remains a challenging\nendeavor. To tackle the above problems, we present a novel mutual information\nand modality consensus network, namely CMInfoNet, to extract modality-invariant\nidentity features with the most representative information and reduce the\nredundancies. The key insight of our method is to find an optimal\nrepresentation to capture more identity-relevant information and compress the\nirrelevant parts by optimizing a mutual information bottleneck trade-off.\nBesides, we propose an automatic search strategy to find the most prominent\nparts that identify the pedestrians.
To eliminate the cross- and intra-modality\nvariations, we also devise a modality consensus module to align the visible and\ninfrared modalities for task-specific guidance. Moreover, the global-local\nfeature representations can also be acquired for key parts discrimination.\nExperimental results on six benchmarks, i.e., SYSU-MM01, RegDB,\nOccluded-DukeMTMC, Occluded-REID, Partial-REID and Partial\_iLIDS, have\ndemonstrated the effectiveness of CMInfoNet.\n","authors":["Haichao Shi","Mandi Luo","Xiao-Yu Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2308.15063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15061v1","updated":"2023-08-29T06:50:04Z","published":"2023-08-29T06:50:04Z","title":"AIoT-Based Drum Transcription Robot using Convolutional Neural Networks","summary":" With the development of information technology, robot technology has made\ngreat progress in various fields. These new technologies enable robots to be\nused in industry, agriculture, education and other areas. In this paper, we\npropose a drum robot that can automatically complete music transcription in\nreal-time, which is based on AIoT and fog computing technology. Specifically,\nthis drum robot system consists of a cloud node for data storage, edge nodes\nfor real-time computing, and data-oriented execution application nodes. In\norder to analyze drumming music and realize drum transcription, we further\npropose a light-weight convolutional neural network model to classify drums,\nwhich can be more effectively deployed in terminal devices for fast edge\ncalculations. The experimental results show that the proposed system can\nachieve competitive performance and support a variety of smart applications\nand services.\n","authors":["Yukun Su","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2308.15061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15056v1","updated":"2023-08-29T06:33:13Z","published":"2023-08-29T06:33:13Z","title":"A Consumer-tier based Visual-Brain Machine Interface for Augmented\n Reality Glasses Interactions","summary":" Objective. Visual-Brain Machine Interface (V-BMI) provides a novel\ninteraction technique for Augmented Reality (AR) industries. Several\nstate-of-the-art works have demonstrated its high accuracy and real-time interaction\ncapabilities. However, most of the studies employ EEG devices that are rigid\nand difficult to apply in real-life AR glasses application scenarios. Here\nwe develop a consumer-tier Visual-Brain Machine Interface (V-BMI) system\nspecialized for Augmented Reality (AR) glasses interactions. Approach. The\ndeveloped system consists of wearable hardware which takes advantage of fast\nset-up, reliable recording and a comfortable wearing experience,\nspecialized for AR glasses applications. Complementing this hardware, we have\ndevised a software framework that facilitates real-time interactions within the\nsystem while accommodating a modular configuration to enhance scalability. Main\nresults. The developed hardware weighs only 110 g and measures 120x85x23 mm, with\n1 TOhm impedance and a peak-to-peak voltage of less than 1.5 uV; a V-BMI based Angry\nBirds game and an Internet of Things (IoT) AR application were designed,\ndemonstrating the technology's merits of intuitive experience and efficient\ninteraction. The real-time interaction accuracy is between 85% and 96%\nin commercial AR glasses (DTI of 2.24 s and ITR of 65 bits/min).\nSignificance.
Our study indicates that the developed system can provide an essential\nhardware-software framework for consumer-based V-BMI AR glasses. Also, we\nderive several pivotal design factors for a consumer-grade V-BMI-based AR\nsystem: 1) Dynamic adaptation of stimulation patterns and classification methods\nvia computer vision algorithms is necessary for AR glasses applications; and 2)\nAlgorithmic localization to foster system stability and latency reduction.\n","authors":["Yuying Jiang","Fan Bai","Zicheng Zhang","Xiaochen Ye","Zheng Liu","Zhiping Shi","Jianwei Yao","Xiaojun Liu","Fangkun Zhu","Junling Li Qian Guo","Xiaoan Wang","Junwen Luo"],"pdf_url":"https://arxiv.org/pdf/2308.15056v1.pdf","comment":"15 pages,10 figures"},{"id":"http://arxiv.org/abs/2303.10452v2","updated":"2023-08-29T06:32:06Z","published":"2023-03-18T16:40:10Z","title":"Confidence Attention and Generalization Enhanced Distillation for\n Continuous Video Domain Adaptation","summary":" Continuous Video Domain Adaptation (CVDA) is a scenario where a source model\nis required to adapt to a series of individually available changing target\ndomains continuously without source data or target supervision. It has wide\napplications, such as robotic vision and autonomous driving. The main\nunderlying challenge of CVDA is to learn helpful information only from the\nunsupervised target data while avoiding catastrophically forgetting previously\nlearned knowledge, which is beyond the capability of previous Video-based\nUnsupervised Domain Adaptation methods. Therefore, we propose a\nConfidence-Attentive network with geneRalization enhanced self-knowledge\ndisTillation (CART) to address the challenge in CVDA. Firstly, to learn from\nunsupervised domains, we propose to learn from pseudo labels. However, in\ncontinuous adaptation, prediction errors can accumulate rapidly in pseudo\nlabels, and CART effectively tackles this problem with two key modules.\nSpecifically, the first module generates refined pseudo labels using model\npredictions and deploys a novel attentive learning strategy. The second module\ncompares the outputs of augmented data from the current model to the outputs of\nweakly augmented data from the source model, forming a novel consistency\nregularization on the model to alleviate the accumulation of prediction errors.\nExtensive experiments suggest that the CVDA performance of CART outperforms\nexisting methods by a considerable margin.\n","authors":["Xiyu Wang","Yuecong Xu","Jianfei Yang","Bihan Wen","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2303.10452v2.pdf","comment":"16 pages, 9 tables, 10 figures"},{"id":"http://arxiv.org/abs/2308.15050v1","updated":"2023-08-29T06:20:36Z","published":"2023-08-29T06:20:36Z","title":"iBARLE: imBalance-Aware Room Layout Estimation","summary":" Room layout estimation predicts layouts from a single panorama. It requires\ndatasets with large-scale and diverse room shapes to train the models. However,\nthere are significant imbalances in real-world datasets including the\ndimensions of layout complexity, camera locations, and variation in scene\nappearance. These issues considerably influence the model training performance.\nIn this work, we propose the imBalance-Aware Room Layout Estimation (iBARLE)\nframework to address these issues. iBARLE consists of (1) Appearance Variation\nGeneration (AVG) module, which promotes visual appearance domain\ngeneralization, (2) Complex Structure Mix-up (CSMix) module, which enhances\ngeneralizability w.r.t.
room structure, and (3) a gradient-based layout\nobjective function, which allows more effective accounting for occlusions in\ncomplex layouts. All modules are jointly trained and help each other to achieve\nthe best performance. Experiments and ablation studies based on the\nZInD~\cite{cruz2021zillow} dataset illustrate that iBARLE has state-of-the-art\nperformance compared with other layout estimation baselines.\n","authors":["Taotao Jing","Lichen Wang","Naji Khosravan","Zhiqiang Wan","Zachary Bessinger","Zhengming Ding","Sing Bing Kang"],"pdf_url":"https://arxiv.org/pdf/2308.15050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15049v1","updated":"2023-08-29T06:14:06Z","published":"2023-08-29T06:14:06Z","title":"Pose-Free Neural Radiance Fields via Implicit Pose Regularization","summary":" Pose-free neural radiance fields (NeRF) aim to train NeRF with unposed\nmulti-view images, and have achieved impressive success in recent years.\nMost existing works share the pipeline of training a coarse pose estimator with\nrendered images at first, followed by a joint optimization of estimated poses\nand neural radiance field. However, as the pose estimator is trained with only\nrendered images, the pose estimation is usually biased or inaccurate for real\nimages due to the domain gap between real images and rendered images, leading\nto poor robustness for the pose estimation of real images and further local\nminima in joint optimization. We design IR-NeRF, an innovative pose-free NeRF\nthat introduces implicit pose regularization to refine the pose estimator with\nunposed real images and improve the robustness of the pose estimation for real\nimages. With a collection of 2D images of a specific scene, IR-NeRF constructs\na scene codebook that stores scene features and captures the scene-specific\npose distribution implicitly as priors. Thus, the robustness of pose estimation\ncan be promoted with the scene priors according to the rationale that a 2D real\nimage can be well reconstructed from the scene codebook only when its estimated\npose lies within the pose distribution. Extensive experiments show that IR-NeRF\nachieves superior novel view synthesis and outperforms the state-of-the-art\nconsistently across multiple synthetic and real datasets.\n","authors":["Jiahui Zhang","Fangneng Zhan","Yingchen Yu","Kunhao Liu","Rongliang Wu","Xiaoqin Zhang","Ling Shao","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2308.15049v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.09121v2","updated":"2023-08-29T06:11:58Z","published":"2023-05-16T03:00:04Z","title":"A Conditional Denoising Diffusion Probabilistic Model for Radio\n Interferometric Image Reconstruction","summary":" In radio astronomy, signals from radio telescopes are transformed into images\nof observed celestial objects, or sources. However, these images, called dirty\nimages, contain real sources as well as artifacts due to signal sparsity and\nother factors. Therefore, radio interferometric image reconstruction is\nperformed on dirty images, aiming to produce clean images in which artifacts\nare reduced and real sources are recovered. So far, existing methods have\nlimited success in recovering faint sources, preserving detailed structures,\nand eliminating artifacts. In this paper, we present VIC-DDPM, a Visibility and\nImage Conditioned Denoising Diffusion Probabilistic Model.
Our main idea is to\nuse both the original visibility data in the spectral domain and dirty images\nin the spatial domain to guide the image generation process with DDPM. This\nway, we can leverage DDPM to generate fine details and eliminate noise, while\nutilizing visibility data to separate signals from noise and retaining spatial\ninformation in dirty images. We have conducted experiments in comparison with\nboth traditional methods and recent deep learning based approaches. Our results\nshow that our method significantly improves the resulting images by reducing\nartifacts, preserving fine details, and recovering dim sources. This\nadvancement further facilitates radio astronomical data analysis tasks on\ncelestial phenomena.\n","authors":["Ruoqi Wang","Zhuoyang Chen","Qiong Luo","Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2305.09121v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.15037v1","updated":"2023-08-29T05:44:00Z","published":"2023-08-29T05:44:00Z","title":"Is it an i or an l: Test-time Adaptation of Text Line Recognition Models","summary":" Recognizing text lines from images is a challenging problem, especially for\nhandwritten documents due to large variations in writing styles. While text\nline recognition models are generally trained on large corpora of real and\nsynthetic data, such models can still make frequent mistakes if the handwriting\nis inscrutable or the image acquisition process adds corruptions, such as\nnoise, blur, compression, etc. Writing style is generally quite consistent for\nan individual, which can be leveraged to correct mistakes made by such models.\nMotivated by this, we introduce the problem of adapting text line recognition\nmodels during test time. We focus on a challenging and realistic setting where,\ngiven only a single test image consisting of multiple text lines, the task is\nto adapt the model such that it performs better on the image, without any\nlabels. We propose an iterative self-training approach that uses feedback from\nthe language model to update the optical model, with confident self-labels in\neach iteration. The confidence measure is based on an augmentation mechanism\nthat evaluates the divergence of the prediction of the model in a local region.\nWe perform a rigorous evaluation of our method on several benchmark datasets as\nwell as their corrupted versions. Experimental results on multiple datasets\nspanning multiple scripts show that the proposed adaptation method offers an\nabsolute improvement of up to 8% in character error rate with just a few\niterations of self-training at test time.\n","authors":["Debapriya Tula","Sujoy Paul","Gagan Madan","Peter Garst","Reeve Ingle","Gaurav Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2308.15037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10875v2","updated":"2023-08-29T05:42:49Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" With the growth of 3D sensing technology, deep learning systems for 3D point\nclouds have become increasingly important, especially in applications like\nautonomous vehicles where safety is a primary concern. However, there are also\ngrowing concerns about the reliability of these systems when they encounter\nnoisy point clouds, whether occurring naturally or introduced with malicious\nintent.
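The test-time adaptation loop in the text-line entry above is easy to sketch. A minimal version, assuming a recognizer exposing decode(image) -> text and loss(image, label) (both hypothetical interfaces of ours); the agreement-under-augmentation confidence below is a crude stand-in for the paper's divergence-based measure:

```python
import torch

def adapt_on_image(model, optimizer, lines, augment, n_iters=3):
    # Iterative self-training on a single test image: pseudo-label the text
    # lines the model is confident about, then fine-tune on those labels only.
    for _ in range(n_iters):
        with torch.no_grad():
            preds = [model.decode(l) for l in lines]
            # Confidence proxy: the prediction survives a local augmentation
            # unchanged (stand-in for the divergence measure in the paper).
            confident = [(l, p) for l, p in zip(lines, preds)
                         if model.decode(augment(l)) == p]
        if not confident:
            break
        optimizer.zero_grad()
        loss = sum(model.loss(l, p) for l, p in confident) / len(confident)
        loss.backward()
        optimizer.step()
```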
This paper highlights the challenges of point cloud classification\nposed by various forms of noise, from simple background noise to malicious\nbackdoor attacks that can intentionally skew model predictions. While there's\nan urgent need for optimized point cloud denoising, current point outlier\nremoval approaches, an essential step in denoising, rely heavily on\nhandcrafted strategies and are not adapted for higher-level tasks, such as\nclassification. To address this issue, we introduce an innovative point outlier\ncleansing method that harnesses the power of downstream classification models.\nBy employing gradient-based attribution analysis, we define a novel concept:\npoint risk. Drawing inspiration from tail risk minimization in finance, we\nrecast the outlier removal process as an optimization problem, named PointCVaR.\nExtensive experiments show that our proposed technique not only robustly\nfilters diverse point cloud outliers but also consistently and significantly\nenhances existing robust methods for point cloud classification.\n","authors":["Xinke Li","Junchi Lu","Henghui Ding","Changsheng Sun","Joey Tianyi Zhou","Chee Yeow Meng"],"pdf_url":"https://arxiv.org/pdf/2307.10875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05021v4","updated":"2023-08-29T05:20:36Z","published":"2023-03-09T03:48:24Z","title":"DiffusionDepth: Diffusion Denoising Approach for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a challenging task that predicts the pixel-wise\ndepth from a single 2D image. Current methods typically model this problem as a\nregression or classification task. We propose DiffusionDepth, a new approach\nthat reformulates monocular depth estimation as a denoising diffusion process.\nIt learns an iterative denoising process to `denoise' a random depth distribution\ninto a depth map with the guidance of monocular visual conditions. The process\nis performed in the latent space encoded by a dedicated depth encoder and\ndecoder. Instead of diffusing ground truth (GT) depth, the model learns to\nreverse the process of diffusing the refined depth of itself into a random depth\ndistribution. This self-diffusion formulation overcomes the difficulty of\napplying generative models to sparse GT depth scenarios. The proposed approach\nbenefits this task by refining depth estimation step by step, which is superior\nfor generating accurate and highly detailed depth maps. Experimental results on\nKITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion\napproach could reach state-of-the-art performance in both indoor and outdoor\nscenarios with acceptable inference time.\n","authors":["Yiqun Duan","Xianda Guo","Zheng Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.05021v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14066v2","updated":"2023-08-29T04:59:41Z","published":"2023-08-27T10:39:33Z","title":"Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential\n Generative Adversarial Networks","summary":" In this paper, we propose a bi-modality medical image synthesis approach\nbased on sequential generative adversarial network (GAN) and semi-supervised\nlearning. Our approach consists of two generative modules that synthesize\nimages of the two modalities in a sequential order. A method for measuring the\nsynthesis complexity is proposed to automatically determine the synthesis order\nin our sequential GAN.
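The "point risk" construction in PointCVaR lends itself to a compact sketch: score each point by a gradient-based attribution and keep the low-risk fraction. The risk definition below (input-gradient norm toward a class of interest) and the fixed removal fraction are stand-ins of ours for the paper's actual optimization:

```python
import torch

def filter_by_point_risk(model, points, label, alpha=0.1):
    # points: (N, 3) point cloud; label: class of interest (e.g., predicted).
    pts = points.clone().requires_grad_(True)
    logits = model(pts.unsqueeze(0))          # (1, num_classes)
    logits[0, label].backward()
    # Gradient-based attribution: points with large input gradients have an
    # outsized influence on the decision and are treated as risky.
    risk = pts.grad.norm(dim=1)               # (N,)
    k = int((1 - alpha) * len(points))        # keep the lowest-risk fraction
    keep = risk.topk(k, largest=False).indices
    return points[keep].detach()
```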
Images of the modality with a lower complexity are\nsynthesized first, and the counterparts with a higher complexity are generated\nlater. Our sequential GAN is trained end-to-end in a semi-supervised manner. In\nsupervised training, the joint distribution of bi-modality images is learned\nfrom real paired images of the two modalities by explicitly minimizing the\nreconstruction losses between the real and synthetic images. To avoid\noverfitting limited training images, in unsupervised training, the marginal\ndistribution of each modality is learned based on unpaired images by minimizing\nthe Wasserstein distance between the distributions of real and fake images. We\ncomprehensively evaluate the proposed model using two synthesis tasks based on\nthree types of evaluation metrics and user studies. Visual and quantitative\nresults demonstrate the superiority of our method over the state-of-the-art\nmethods, with reasonable visual quality and clinical significance. Code is made\npublicly available at\nhttps://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis.\n","authors":["Xin Yang","Yi Lin","Zhiwei Wang","Xin Li","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.14066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15019v1","updated":"2023-08-29T04:46:52Z","published":"2023-08-29T04:46:52Z","title":"Pyramid diffractive optical networks for unidirectional magnification\n and demagnification","summary":" Diffractive deep neural networks (D2NNs) are composed of successive\ntransmissive layers optimized using supervised deep learning to all-optically\nimplement various computational tasks between an input and output field-of-view\n(FOV). Here, we present a pyramid-structured diffractive optical network design\n(which we term P-D2NN), optimized specifically for unidirectional image\nmagnification and demagnification. In this P-D2NN design, the diffractive\nlayers are pyramidally scaled in alignment with the direction of the image\nmagnification or demagnification. Our analyses revealed the efficacy of this\nP-D2NN design in unidirectional image magnification and demagnification tasks,\nproducing high-fidelity magnified or demagnified images in only one direction,\nwhile inhibiting the image formation in the opposite direction - confirming the\ndesired unidirectional imaging operation. Compared to the conventional D2NN\ndesigns with uniform-sized successive diffractive layers, P-D2NN design\nachieves similar performance in unidirectional magnification tasks using only\nhalf of the diffractive degrees of freedom within the optical processor volume.\nFurthermore, it maintains its unidirectional image\nmagnification/demagnification functionality across a large band of illumination\nwavelengths despite being trained with a single illumination wavelength. With\nthis pyramidal architecture, we also designed a wavelength-multiplexed\ndiffractive network, where a unidirectional magnifier and a unidirectional\ndemagnifier operate simultaneously in opposite directions, at two distinct\nillumination wavelengths. The efficacy of the P-D2NN architecture was also\nvalidated experimentally using monochromatic terahertz illumination,\nsuccessfully matching our numerical simulations.
P-D2NN offers a\nphysics-inspired strategy for designing task-specific visual processors.\n","authors":["Bijie Bai","Xilin Yang","Tianyi Gan","Jingxi Li","Deniz Mengu","Mona Jarrahi","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2308.15019v1.pdf","comment":"26 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2308.15016v1","updated":"2023-08-29T04:39:07Z","published":"2023-08-29T04:39:07Z","title":"C2G2: Controllable Co-speech Gesture Generation with Latent Diffusion\n Model","summary":" Co-speech gesture generation is crucial for automatic digital avatar\nanimation. However, existing methods suffer from issues such as unstable\ntraining and temporal inconsistency, particularly in generating high-fidelity\nand comprehensive gestures. Additionally, these methods lack effective control\nover speaker identity and temporal editing of the generated gestures. Focusing\non capturing temporal latent information and applying practical controlling, we\npropose a Controllable Co-speech Gesture Generation framework, named C2G2.\nSpecifically, we propose a two-stage temporal dependency enhancement strategy\nmotivated by latent diffusion models. We further introduce two key features to\nC2G2, namely a speaker-specific decoder to generate speaker-related real-length\nskeletons and a repainting strategy for flexible gesture generation/editing.\nExtensive experiments on benchmark gesture datasets verify the effectiveness of\nour proposed C2G2 compared with several state-of-the-art baselines. The link of\nthe project demo page can be found at https://c2g2-gesture.github.io/c2_gesture\n","authors":["Longbin Ji","Pengfei Wei","Yi Ren","Jinglin Liu","Chen Zhang","Xiang Yin"],"pdf_url":"https://arxiv.org/pdf/2308.15016v1.pdf","comment":"12 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2307.11058v2","updated":"2023-08-29T04:10:18Z","published":"2023-07-20T17:38:55Z","title":"Anticipating Driving Behavior through Deep Learning-Based Policy\n Prediction","summary":" In this endeavor, we developed a comprehensive system that processes\nintegrated visual features derived from video frames captured by a regular\ncamera, along with depth details obtained from a point cloud scanner. This\nsystem is designed to anticipate driving actions, encompassing both vehicle\nspeed and steering angle. To ensure its reliability, we conducted assessments\nwhere we juxtaposed the projected outcomes with the established norms adhered\nto by skilled real-world drivers. Our evaluation outcomes indicate that the\nforecasts achieve a noteworthy level of accuracy in a minimum of half the test\nscenarios (ranging around 50-80%, contingent on the specific model). Notably,\nthe utilization of amalgamated features yielded superior performance in\ncomparison to using video frames in isolation, as demonstrated by most of the\ncases.\n","authors":["Fuxiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.11058v2.pdf","comment":"5 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.15005v1","updated":"2023-08-29T03:54:26Z","published":"2023-08-29T03:54:26Z","title":"Few-Shot Object Detection via Synthetic Features with Optimal Transport","summary":" Few-shot object detection aims to simultaneously localize and classify the\nobjects in an image with limited training samples. However, most existing\nfew-shot object detection methods focus on extracting the features of a few\nsamples of novel classes that lack diversity. Hence, they may not be sufficient\nto capture the data distribution. 
To address that limitation, in this paper, we\npropose a novel approach in which we train a generator to generate synthetic\ndata for novel classes. Still, directly training a generator on the novel class\nis not effective due to the lack of novel data. To overcome that issue, we\nleverage the large-scale dataset of base classes. Our overarching goal is to\ntrain a generator that captures the data variations of the base dataset. We\nthen transform the captured variations into novel classes by generating\nsynthetic data with the trained generator. To encourage the generator to\ncapture data variations on base classes, we propose to train the generator with\nan optimal transport loss that minimizes the optimal transport distance between\nthe distributions of real and synthetic data. Extensive experiments on two\nbenchmark datasets demonstrate that the proposed method outperforms the state\nof the art. Source code will be available.\n","authors":["Anh-Khoa Nguyen Vu","Thanh-Toan Do","Vinh-Tiep Nguyen","Tam Le","Minh-Triet Tran","Tam V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.15005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15004v1","updated":"2023-08-29T03:41:27Z","published":"2023-08-29T03:41:27Z","title":"PBFormer: Capturing Complex Scene Text Shape with Polynomial Band\n Transformer","summary":" We present PBFormer, an efficient yet powerful scene text detector that\nunifies the transformer with a novel text shape representation Polynomial Band\n(PB). The representation has four polynomial curves to fit a text's top,\nbottom, left, and right sides, which can capture a text with a complex shape by\nvarying polynomial coefficients. PB has appealing features compared with\nconventional representations: 1) It can model different curvatures with a fixed\nnumber of parameters, while polygon-points-based methods need to utilize a\ndifferent number of points. 2) It can distinguish adjacent or overlapping texts\nas they have apparently different curve coefficients, while segmentation-based or\npoints-based methods suffer from adhesive spatial positions. PBFormer combines\nthe PB with the transformer, which can directly generate smooth text contours\nsampled from predicted curves without interpolation. A parameter-free\ncross-scale pixel attention (CPA) module is employed to highlight the feature\nmap of a suitable scale while suppressing the other feature maps. This simple\noperation can help detect small-scale texts and is compatible with the\none-stage DETR framework, where no postprocessing exists for NMS. Furthermore,\nPBFormer is trained with a shape-contained loss, which not only enforces the\npiecewise alignment between the ground truth and the predicted curves but also\nmakes curves' positions and shapes consistent with each other. Without bells\nand whistles such as text pre-training, our method is superior to the previous\nstate-of-the-art text detectors on the arbitrary-shaped text datasets.\n","authors":["Ruijin Liu","Ning Lu","Dapeng Chen","Cheng Li","Zejian Yuan","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2308.15004v1.pdf","comment":"9 pages, 8 figures, accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.00290v2","updated":"2023-08-29T03:31:58Z","published":"2023-07-01T10:12:46Z","title":"All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with\n Prompt-based Finetuning","summary":" The Segment Anything Model (SAM) is a recently proposed prompt-based\nsegmentation model in a generic zero-shot segmentation approach.
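The optimal transport loss for aligning real and synthetic feature distributions, as in the few-shot entry above, can be sketched with a few Sinkhorn iterations. A minimal log-domain entropic-OT distance between two feature batches (the paper's exact formulation and ground cost may differ):

```python
import math
import torch

def sinkhorn_ot(x, y, eps=0.1, n_iters=50):
    # Entropic OT distance between two uniform empirical distributions,
    # here over real vs. generator-synthesized feature batches.
    cost = torch.cdist(x, y, p=2) ** 2          # (n, m) squared Euclidean cost
    n, m = cost.shape
    log_a = torch.full((n,), -math.log(n))      # log uniform weights
    log_b = torch.full((m,), -math.log(m))
    f = torch.zeros(n)
    g = torch.zeros(m)
    for _ in range(n_iters):                    # Sinkhorn updates in log space
        f = -eps * torch.logsumexp((g - cost) / eps + log_b, dim=1)
        g = -eps * torch.logsumexp((f.unsqueeze(1) - cost) / eps
                                   + log_a.unsqueeze(1), dim=0)
    # Primal transport plan and its cost; differentiable w.r.t. x and y,
    # so it can serve directly as a generator training loss.
    plan = torch.exp((f.unsqueeze(1) + g.unsqueeze(0) - cost) / eps
                     + log_a.unsqueeze(1) + log_b)
    return (plan * cost).sum()

loss = sinkhorn_ot(torch.randn(32, 64), torch.randn(32, 64))
```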
With the\nzero-shot segmentation capacity, SAM achieved impressive flexibility and\nprecision on various segmentation tasks. However, the current pipeline requires\nmanual prompts during the inference stage, which is still resource intensive\nfor biomedical image segmentation. In this paper, instead of using prompts\nduring the inference stage, we introduce a pipeline that utilizes the SAM,\ncalled all-in-SAM, through the entire AI development workflow (from annotation\ngeneration to model finetuning) without requiring manual prompts during the\ninference stage. Specifically, SAM is first employed to generate pixel-level\nannotations from weak prompts (e.g., points, bounding box). Then, the\npixel-level annotations are used to finetune the SAM segmentation model rather\nthan training from scratch. Our experimental results reveal two key findings:\n1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a\nnuclei segmentation task on the public Monuseg dataset, and 2) the utilization\nof weak and few annotations for SAM finetuning achieves competitive performance\ncompared to using strong pixel-wise annotated data.\n","authors":["Can Cui","Ruining Deng","Quan Liu","Tianyuan Yao","Shunxing Bao","Lucas W. Remedios","Yucheng Tang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2307.00290v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14995v1","updated":"2023-08-29T02:50:36Z","published":"2023-08-29T02:50:36Z","title":"WSAM: Visual Explanations from Style Augmentation as Adversarial\n Attacker and Their Influence in Image Classification","summary":" Currently, style augmentation is capturing attention due to convolutional\nneural networks (CNN) being strongly biased toward recognizing textures rather\nthan shapes. Most existing styling methods either perform a low-fidelity style\ntransfer or a weak style representation in the embedding vector. This paper\noutlines a style augmentation algorithm using stochastic-based sampling with\nnoise addition to improve randomization on a general linear transformation\nfor style transfer. With our augmentation strategy, all models not only exhibit\nstrong robustness against image stylization but also outperform all previous\nmethods and surpass the state-of-the-art performance on the STL-10 dataset. In\naddition, we present an analysis of the model interpretations under different\nstyle variations. At the same time, we conduct comprehensive experiments\ndemonstrating the performance when the method is applied to deep neural\narchitectures in different training settings.\n","authors":["Felipe Moreno-Vera","Edgar Medina","Jorge Poco"],"pdf_url":"https://arxiv.org/pdf/2308.14995v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.14221v2","updated":"2023-08-29T02:50:25Z","published":"2023-08-27T22:45:24Z","title":"High-Resolution Document Shadow Removal via A Large-Scale Real-World\n Dataset and A Frequency-Aware Shadow Erasing Net","summary":" Shadows often occur when we capture the documents with casual equipment,\nwhich influences the visual quality and readability of the digital copies.\nDifferent from the algorithms for natural shadow removal, the algorithms in\ndocument shadow removal need to preserve the details of fonts and figures in\nhigh-resolution input. Previous works ignore this problem and remove the\nshadows via approximate attention and small datasets, which might not work in\nreal-world situations.
We handle high-resolution document shadow removal\ndirectly via a larger-scale real-world dataset and a carefully designed\nfrequency-aware network. As for the dataset, we acquire over 7k pairs of\nhigh-resolution (2462 x 3699) real-world document images with various\nsamples under different lighting circumstances, which is 10 times larger than\nexisting datasets. As for the design of the network, we decouple the\nhigh-resolution images in the frequency domain, where the low-frequency details\nand high-frequency boundaries can be effectively learned via the carefully\ndesigned network structure. Powered by our network and dataset, the proposed\nmethod clearly shows a better performance than previous methods in terms of\nvisual quality and numerical results. The code, models, and dataset are\navailable at: https://github.com/CXH-Research/DocShadow-SD7K\n","authors":["Zinuo Li","Xuhang Chen","Chi-Man Pun","Xiaodong Cun"],"pdf_url":"https://arxiv.org/pdf/2308.14221v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14994v1","updated":"2023-08-29T02:49:16Z","published":"2023-08-29T02:49:16Z","title":"ICARUS: An Android-Based Unmanned Aerial Vehicle (UAV) Search and Rescue\n Eye in the Sky","summary":" The purpose of this paper is to develop an unmanned aerial vehicle (UAV)\nusing a quadcopter with the capability of video surveillance, map coordinates,\na deployable parachute with a medicine kit or a food pack as a payload, a\ncollision warning system, remotely controlled, integrated with an android\napplication to assist in search and rescue operations.\n The study employed applied research for the development of the functional\nprototype, and quantitative and descriptive statistics to summarize data by\ndescribing the relationship between variables in a sample or population. The\nquadcopter underwent an evaluation using a survey instrument to test its\nacceptability using predefined variables to select respondents within Caloocan\nCity and Quezon City, Philippines.\n Demographic profiles and known issues and concerns were gathered from 30\nrespondents. The results were summarized and distributed in Tables 1 and 2.\n In terms of demographic profiles, the number of SAR operators within the\nspecified areas is distributed equally; most are male, single, and within the\nage bracket of 31 and above. Regarding issues and concerns, the most common type\nof search and rescue was ground search and rescue. Human error is the primary\ncause of most injuries in operating units. All respondents agreed that the\nprototype was useful and that, in terms of acceptability, drone technology will\nimprove search and rescue operations.\n The innovative way of utilizing Android and drone technology is a new step\ntowards the improvement of SAR operations in the Philippines.\n The LiPo battery must be replaced with a higher-capacity one, and the drone\noperator should undergo a training course and secure a permit from the Civil\nAviation Authority of the Philippines (CAAP).\n","authors":["Manuel Luis C. Delos Santos","Jerum B. Dasalla","Jomar C. Feliciano","Dustin Red B. Cabatay"],"pdf_url":"https://arxiv.org/pdf/2308.14994v1.pdf","comment":"15 pages, 14 figures, Special Issue: IRCCETE 2023"},{"id":"http://arxiv.org/abs/2210.00429v2","updated":"2023-08-29T02:32:22Z","published":"2022-10-02T05:34:19Z","title":"ROSIA: Rotation-Search-Based Star Identification Algorithm","summary":" This paper presents a rotation-search-based approach for addressing the star\nidentification (Star-ID) problem.
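One simple way to realize the frequency-domain decoupling the shadow-removal entry describes is a Gaussian low-pass split: the low-frequency band carries illumination and shadow level, while the residual carries fonts and figure boundaries. A sketch (the depthwise Gaussian kernel, its size, and sigma are arbitrary choices of ours):

```python
import torch
import torch.nn.functional as F

def frequency_decouple(img, kernel_size=21, sigma=5.0):
    # Split an image into low-frequency content (illumination/shadow level)
    # and a high-frequency residual (fonts, figure boundaries).
    ax = torch.arange(kernel_size, dtype=torch.float32) - kernel_size // 2
    g = torch.exp(-ax ** 2 / (2 * sigma ** 2))
    g = g / g.sum()
    k2d = torch.outer(g, g).expand(img.size(1), 1, -1, -1)  # depthwise Gaussian
    low = F.conv2d(img, k2d, padding=kernel_size // 2, groups=img.size(1))
    high = img - low
    return low, high

low, high = frequency_decouple(torch.rand(1, 3, 64, 64))
```

Each band can then be fed to its own branch of the network and the outputs recombined by simple addition.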
The proposed algorithm, ROSIA, is a\nheuristics-free algorithm that seeks the optimal rotation that maximally aligns\nthe input and catalog stars in their respective coordinates. ROSIA searches the\nrotation space systematically with the Branch-and-Bound (BnB) method. Crucially\naffecting the runtime feasibility of ROSIA is the upper bound function that\nprioritizes the search space. In this paper, we make a theoretical contribution\nby proposing a tight (provable) upper bound function that enables a 400x\nspeed-up compared to an existing formulation. Coupling the bounding function\nwith an efficient evaluation scheme that leverages stereographic projection and\nthe R-tree data structure, ROSIA achieves feasible operational speed on\nembedded processors with state-of-the-art performance under different sources\nof noise. The source code of ROSIA is available at\nhttps://github.com/ckchng/ROSIA.\n","authors":["Chee-Kheng Chng","Alvaro Parra Bustos","Benjamin McCarthy","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2210.00429v2.pdf","comment":"21 pages, 16 figures, Accepted to IEEE Transactions on Aerospace and\n Electronic Systems"},{"id":"http://arxiv.org/abs/2308.14978v1","updated":"2023-08-29T02:09:56Z","published":"2023-08-29T02:09:56Z","title":"Vision Grid Transformer for Document Layout Analysis","summary":" Document pre-trained models and grid-based models have proven to be very\neffective on various tasks in Document AI. However, for the document layout\nanalysis (DLA) task, existing document pre-trained models, even those\npre-trained in a multi-modal fashion, usually rely on either textual features\nor visual features. Grid-based models for DLA are multi-modal but largely\nneglect the effect of pre-training. To fully leverage multi-modal information\nand exploit pre-training techniques to learn better representation for DLA, in\nthis paper, we present VGT, a two-stream Vision Grid Transformer, in which Grid\nTransformer (GiT) is proposed and pre-trained for 2D token-level and\nsegment-level semantic understanding. Furthermore, a new dataset named D$^4$LA,\nwhich is so far the most diverse and detailed manually-annotated benchmark for\ndocument layout analysis, is curated and released. Experiment results have\nillustrated that the proposed VGT model achieves new state-of-the-art results\non DLA tasks, e.g. PubLayNet ($95.7\%$$\rightarrow$$96.2\%$), DocBank\n($79.6\%$$\rightarrow$$84.1\%$), and D$^4$LA ($67.7\%$$\rightarrow$$68.8\%$).\nThe code and models as well as the D$^4$LA dataset will be made publicly\navailable at ~\url{https://github.com/AlibabaResearch/AdvancedLiterateMachinery}.\n","authors":["Cheng Da","Chuwei Luo","Qi Zheng","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2308.14978v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03998v2","updated":"2023-08-29T01:57:19Z","published":"2023-08-08T02:28:48Z","title":"Real-time Strawberry Detection Based on Improved YOLOv5s Architecture\n for Robotic Harvesting in open-field environment","summary":" This study proposed a YOLOv5-based custom object detection model to detect\nstrawberries in an outdoor environment. The original architecture of the\nYOLOv5s was modified by replacing the C3 module with the C2f module in the\nbackbone network, which provided a better feature gradient flow.
Secondly, the\nSpatial Pyramid Pooling Fast in the final layer of the backbone network of\nYOLOv5s was combined with Cross Stage Partial Net to improve the generalization\nability over the strawberry dataset in this study. The proposed architecture\nwas named YOLOv5s-Straw. The RGB image dataset of the strawberry canopy with\nthree maturity classes (immature, nearly mature, and mature) was collected in\nan open-field environment and augmented through a series of operations including\nbrightness reduction, brightness increase, and noise addition. To verify the\nsuperiority of the proposed method for strawberry detection in an open-field\nenvironment, four competitive detection models (YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s) were trained, and tested under the same computational\nenvironment and compared with YOLOv5s-Straw. The results showed that the\nhighest mean average precision of 80.3% was achieved using the proposed\narchitecture, whereas YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s achieved 73.4%, 77.8%, 79.8%, and 79.3%, respectively.\nSpecifically, the average precision of YOLOv5s-Straw was 82.1% in the immature\nclass, 73.5% in the nearly mature class, and 86.6% in the mature class, which\nwere 2.3% and 3.7% higher, respectively, than those of the latest YOLOv8s. The\nmodel included 8.6*10^6 network parameters with an inference speed of 18ms per\nimage, while YOLOv8s had a slower inference speed of\n21.0ms and heavier parameters of 11.1*10^6, which indicates that the proposed\nmodel is fast enough for real-time strawberry detection and localization for\nthe robotic picking.\n","authors":["Zixuan He","Salik Ram Khana","Xin Zhang","Manoj Karkee","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03998v2.pdf","comment":"20 pages; 15 figures"},{"id":"http://arxiv.org/abs/2308.14969v1","updated":"2023-08-29T01:47:49Z","published":"2023-08-29T01:47:49Z","title":"Reprogramming under constraints: Revisiting efficient and reliable\n transferability of lottery tickets","summary":" In the era of foundation models with huge pre-training budgets, the\ndownstream tasks have been shifted to the narrative of efficient and fast\nadaptation. For classification-based tasks in the domain of computer vision,\nthe two most efficient approaches have been linear probing (LP) and visual\nprompting/reprogramming (VP); the former aims to learn a classifier in the form\nof a linear head on the features extracted by the pre-trained model, while the\nlatter maps the input data to the domain of the source data on which the model\nwas originally pre-trained on. Although extensive studies have demonstrated the\ndifferences between LP and VP in terms of downstream performance, we explore\nthe capabilities of the two aforementioned methods via the sparsity axis: (a)\nData sparsity: the impact of few-shot adaptation and (b) Model sparsity: the\nimpact of lottery tickets (LT). We demonstrate that LT are not universal\nreprogrammers, i.e., for certain target datasets, reprogramming an LT yields\nsignificantly lower performance than the reprogrammed dense model although\ntheir corresponding upstream performance is similar. Further, we demonstrate\nthat the calibration of dense models is always superior to that of their\nlottery ticket counterparts under both LP and VP regimes. Our empirical study\nopens a new avenue of research into VP for sparse models and encourages further\nunderstanding of the performance beyond the accuracy achieved by VP under\nconstraints of sparsity.
Code and logs can be accessed at\n\\url{https://github.com/landskape-ai/Reprogram_LT}.\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.14965v1","updated":"2023-08-29T01:34:33Z","published":"2023-08-29T01:34:33Z","title":"CEFHRI: A Communication Efficient Federated Learning Framework for\n Recognizing Industrial Human-Robot Interaction","summary":" Human-robot interaction (HRI) is a rapidly growing field that encompasses\nsocial and industrial applications. Machine learning plays a vital role in\nindustrial HRI by enhancing the adaptability and autonomy of robots in complex\nenvironments. However, data privacy is a crucial concern in the interaction\nbetween humans and robots, as companies need to protect sensitive data while\nmachine learning algorithms require access to large datasets. Federated\nLearning (FL) offers a solution by enabling the distributed training of models\nwithout sharing raw data. Despite extensive research on Federated learning (FL)\nfor tasks such as natural language processing (NLP) and image classification,\nthe question of how to use FL for HRI remains an open research problem. The\ntraditional FL approach involves transmitting large neural network parameter\nmatrices between the server and clients, which can lead to high communication\ncosts and often becomes a bottleneck in FL. This paper proposes a\ncommunication-efficient FL framework for human-robot interaction (CEFHRI) to\naddress the challenges of data heterogeneity and communication costs. The\nframework leverages pre-trained models and introduces a trainable\nspatiotemporal adapter for video understanding tasks in HRI. Experimental\nresults on three human-robot interaction benchmark datasets: HRI30, InHARD, and\nCOIN demonstrate the superiority of CEFHRI over full fine-tuning in terms of\ncommunication costs. The proposed methodology provides a secure and efficient\napproach to HRI federated learning, particularly in industrial environments\nwith data privacy concerns and limited communication bandwidth. Our code is\navailable at\nhttps://github.com/umarkhalidAI/CEFHRI-Efficient-Federated-Learning.\n","authors":["Umar Khalid","Hasan Iqbal","Saeed Vahidian","Jing Hua","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14965v1.pdf","comment":"Accepted in IROS 2023"},{"id":"http://arxiv.org/abs/2308.14960v1","updated":"2023-08-29T01:22:30Z","published":"2023-08-29T01:22:30Z","title":"Read-only Prompt Optimization for Vision-Language Few-shot Learning","summary":" In recent years, prompt tuning has proven effective in adapting pre-trained\nvision-language models to downstream tasks. These methods aim to adapt the\npre-trained models by introducing learnable prompts while keeping pre-trained\nweights frozen. However, learnable prompts can affect the internal\nrepresentation within the self-attention module, which may negatively impact\nperformance variance and generalization, especially in data-deficient settings.\nTo address these issues, we propose a novel approach, Read-only Prompt\nOptimization (RPO). RPO leverages masked attention to prevent the internal\nrepresentation shift in the pre-trained model. Further, to facilitate the\noptimization of RPO, the read-only prompts are initialized based on special\ntokens of the pre-trained model. 
Our extensive experiments demonstrate that RPO\noutperforms CLIP and CoCoOp in base-to-new generalization and domain\ngeneralization while displaying better robustness. Also, the proposed method\nachieves better generalization on extremely data-deficient settings, while\nimproving parameter efficiency and computational overhead. Code is available at\nhttps://github.com/mlvlab/RPO.\n","authors":["Dongjun Lee","Seokwon Song","Jihee Suh","Joonmyeong Choi","Sanghyeok Lee","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14960v1.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2306.03454v2","updated":"2023-08-29T01:20:04Z","published":"2023-06-06T07:17:56Z","title":"Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems:\n Challenges and Opportunities","summary":" Multi-Sensor Fusion (MSF) based perception systems have been the foundation\nin supporting many industrial applications and domains, such as self-driving\ncars, robotic arms, and unmanned aerial vehicles. Over the past few years, the\nfast progress in data-driven artificial intelligence (AI) has brought a\nfast-increasing trend to empower MSF systems by deep learning techniques to\nfurther improve performance, especially on intelligent systems and their\nperception systems. Although quite a few AI-enabled MSF perception systems and\ntechniques have been proposed, up to the present, limited benchmarks that focus\non MSF perception are publicly available. Given that many intelligent systems\nsuch as self-driving cars are operated in safety-critical contexts where\nperception systems play an important role, there comes an urgent need for a\nmore in-depth understanding of the performance and reliability of these MSF\nsystems. To bridge this gap, we initiate an early step in this direction and\nconstruct a public benchmark of AI-enabled MSF-based perception systems\nincluding three commonly adopted tasks (i.e., object detection, object\ntracking, and depth completion). Based on this, to comprehensively understand\nMSF systems' robustness and reliability, we design 14 common and realistic\ncorruption patterns to synthesize large-scale corrupted datasets. We further\nperform a systematic evaluation of these systems through our large-scale\nevaluation. Our results reveal the vulnerability of the current AI-enabled MSF\nperception systems, calling for researchers and practitioners to take\nrobustness and reliability into account when designing AI-enabled MSF.\n","authors":["Xinyu Gao","Zhijie Wang","Yang Feng","Lei Ma","Zhenyu Chen","Baowen Xu"],"pdf_url":"https://arxiv.org/pdf/2306.03454v2.pdf","comment":"To appear in ESEC/FSE 2023"},{"id":"http://arxiv.org/abs/2305.14713v2","updated":"2023-08-29T01:10:15Z","published":"2023-05-24T04:30:25Z","title":"Streaming Object Detection on Fisheye Cameras for Automatic Parking","summary":" Fisheye cameras are widely employed in automatic parking, and the video\nstream object detection (VSOD) of the fisheye camera is a fundamental\nperception function to ensure the safe operation of vehicles. In past research\nwork, the difference between the output of the deep learning model and the\nactual situation at the current moment due to the existence of delay of the\nperception system is generally ignored. But the environment will inevitably\nchange within the delay time which may cause a potential safety hazard. 
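A toy numeric illustration of the time-lag problem the fisheye VSOD abstract above describes: output computed on frame t only becomes available several frames later, so a naive detector's prediction should be scored against the later world state. The motion trace, delay, and numbers below are fabricated purely for illustration.

```python
# Streaming-perception delay toy: naive prediction vs. the moved world.
import numpy as np

positions = np.array([0.0, 1.0, 2.0, 3.0, 4.0])  # object x-position per frame
delay = 2                                        # frames of inference latency

for t in range(len(positions) - delay):
    pred = positions[t]           # naive model reports where it saw the object
    truth = positions[t + delay]  # world has moved on when the output arrives
    print(f"t={t}: naive error = {abs(pred - truth):.1f}")
# a predictive (dual-flow) model would instead extrapolate to t + delay
```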
In this\npaper, we propose a real-time detection framework equipped with a dual-flow\nperception module (dynamic and static flows) that can predict the future and\nalleviate the time-lag problem. Meanwhile, we use a new scheme to evaluate\nlatency and accuracy. The standard bounding box is unsuitable for the object in\nfisheye camera images due to the strong radial distortion of the fisheye camera\nand the primary detection objects of parking perception are vehicles and\npedestrians, so we adopt the rotate bounding box and propose a new periodic\nangle loss function to regress the angle of the box, which is the simple and\naccurate representation method of objects. The instance segmentation ground\ntruth is used to supervise the training. Experiments demonstrate the\neffectiveness of our approach. Code is released at:\nhttps://gitee.com/hiyanyx/fisheye-streaming-perception.\n","authors":["Yixiong Yan","Liangzhu Cheng","Yongxu Li","Xinjuan Tuo"],"pdf_url":"https://arxiv.org/pdf/2305.14713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17209v2","updated":"2023-08-29T23:47:49Z","published":"2023-03-30T08:05:59Z","title":"Human from Blur: Human Pose Tracking from Blurry Images","summary":" We propose a method to estimate 3D human poses from substantially blurred\nimages. The key idea is to tackle the inverse problem of image deblurring by\nmodeling the forward problem with a 3D human model, a texture map, and a\nsequence of poses to describe human motion. The blurring process is then\nmodeled by a temporal image aggregation step. Using a differentiable renderer,\nwe can solve the inverse problem by backpropagating the pixel-wise reprojection\nerror to recover the best human motion representation that explains a single or\nmultiple input images. Since the image reconstruction loss alone is\ninsufficient, we present additional regularization terms. To the best of our\nknowledge, we present the first method to tackle this problem. Our method\nconsistently outperforms other methods on significantly blurry inputs since\nthey lack one or multiple key functionalities that our method unifies, i.e.\nimage deblurring with sub-frame accuracy and explicit 3D modeling of non-rigid\nhuman motion.\n","authors":["Yiming Zhao","Denys Rozumnyi","Jie Song","Otmar Hilliges","Marc Pollefeys","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2303.17209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15670v1","updated":"2023-08-29T23:45:54Z","published":"2023-08-29T23:45:54Z","title":"Multimodal Foundation Models For Echocardiogram Interpretation","summary":" Multimodal deep learning foundation models can learn the relationship between\nimages and text. In the context of medical imaging, mapping images to language\nconcepts reflects the clinical task of diagnostic image interpretation, however\ncurrent general-purpose foundation models do not perform well in this context\nbecause their training corpus have limited medical text and images. To address\nthis challenge and account for the range of cardiac physiology, we leverage\n1,032,975 cardiac ultrasound videos and corresponding expert interpretations to\ndevelop EchoCLIP, a multimodal foundation model for echocardiography. 
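The periodic angle loss proposed for the rotated boxes above can be sketched minimally in NumPy: wrap the angular difference into one period so that orientations differing by a full period incur zero loss. The squared penalty and the pi period (a rectangle's 180-degree ambiguity) are assumptions; the paper's exact formulation may differ.

```python
# A generic periodic angle-regression loss, assuming a pi-periodic box angle.
import numpy as np

def periodic_angle_loss(pred, target, period=np.pi):
    # wrap the difference into [-period/2, period/2) so that angles that
    # differ by a whole period are treated as identical orientations
    diff = (pred - target + period / 2) % period - period / 2
    return diff ** 2

print(periodic_angle_loss(0.01, np.pi))  # ~1e-4: effectively the same angle
print(periodic_angle_loss(0.01, np.pi / 2))  # large: orthogonal orientation
```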
EchoCLIP\ndisplays strong zero-shot (not explicitly trained) performance in cardiac\nfunction assessment (external validation left ventricular ejection fraction\nmean absolute error (MAE) of 7.1%) and identification of implanted intracardiac\ndevices (areas under the curve (AUC) between 0.84 and 0.98 for pacemakers and\nartificial heart valves). We also developed a long-context variant (EchoCLIP-R)\nwith a custom echocardiography report text tokenizer which can accurately\nidentify unique patients across multiple videos (AUC of 0.86), identify\nclinical changes such as orthotopic heart transplants (AUC of 0.79) or cardiac\nsurgery (AUC 0.77), and enable robust image-to-text search (mean cross-modal\nretrieval rank in the top 1% of candidate text reports). These emergent\ncapabilities can be used for preliminary assessment and summarization of\nechocardiographic findings.\n","authors":["Matthew Christensen","Milos Vukadinovic","Neal Yuan","David Ouyang"],"pdf_url":"https://arxiv.org/pdf/2308.15670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15667v1","updated":"2023-08-29T23:35:36Z","published":"2023-08-29T23:35:36Z","title":"Bridging Distribution Learning and Image Clustering in High-dimensional\n Space","summary":" Distribution learning focuses on learning the probability density function\nfrom a set of data samples. In contrast, clustering aims to group similar\nobjects together in an unsupervised manner. Usually, these two tasks are\nconsidered unrelated. However, the relationship between the two may be\nindirectly correlated, with Gaussian Mixture Models (GMM) acting as a bridge.\nIn this paper, we focus on exploring the correlation between distribution\nlearning and clustering, with the motivation to fill the gap between these two\nfields, utilizing an autoencoder (AE) to encode images into a high-dimensional\nlatent space. Then, Monte-Carlo Marginalization (MCMarg) and Kullback-Leibler\n(KL) divergence loss are used to fit the Gaussian components of the GMM and\nlearn the data distribution. Finally, image clustering is achieved through each\nGaussian component of GMM. Yet, the \"curse of dimensionality\" poses severe\nchallenges for most clustering algorithms. Compared with the classic\nExpectation-Maximization (EM) Algorithm, experimental results show that MCMarg\nand KL divergence can greatly alleviate the difficulty. Based on the\nexperimental results, we believe distribution learning can exploit the\npotential of GMM in image clustering within high-dimensional space.\n","authors":["Guanfang Dong","Chenqiu Zhao","Anup Basu"],"pdf_url":"https://arxiv.org/pdf/2308.15667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15660v1","updated":"2023-08-29T22:43:46Z","published":"2023-08-29T22:43:46Z","title":"Unveiling Camouflage: A Learnable Fourier-based Augmentation for\n Camouflaged Object Detection and Instance Segmentation","summary":" Camouflaged object detection (COD) and camouflaged instance segmentation\n(CIS) aim to recognize and segment objects that are blended into their\nsurroundings, respectively. While several deep neural network models have been\nproposed to tackle those tasks, augmentation methods for COD and CIS have not\nbeen thoroughly explored. Augmentation strategies can help improve the\nperformance of models by increasing the size and diversity of the training data\nand exposing the model to a wider range of variations in the data. 
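A minimal sketch of the GMM-based clustering step from the distribution-learning abstract above, assuming latent codes already produced by an autoencoder; sklearn's EM-fitted GaussianMixture stands in for the paper's MCMarg + KL fitting, which is not implemented here.

```python
# Cluster latent codes by their most likely Gaussian component.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
# synthetic stand-in for autoencoder latents from two image groups
latents = np.vstack([rng.normal(0, 1, (100, 16)), rng.normal(4, 1, (100, 16))])

gmm = GaussianMixture(n_components=2, random_state=0).fit(latents)
clusters = gmm.predict(latents)  # image clustering via GMM components
print(np.bincount(clusters))
```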
Besides, we\naim to automatically learn transformations that help to reveal the underlying\nstructure of camouflaged objects and allow the model to learn to better\nidentify and segment camouflaged objects. To achieve this, we propose a\nlearnable augmentation method in the frequency domain for COD and CIS via a\nFourier transform approach, dubbed CamoFourier. Our method leverages a\nconditional generative adversarial network and a cross-attention mechanism to\ngenerate a reference image, and an adaptive hybrid swapping with parameters to\nmix the low-frequency component of the reference image and the high-frequency\ncomponent of the input image. This approach aims to make camouflaged objects\nmore visible for detection and segmentation models. Without bells and whistles,\nour proposed augmentation method boosts the performance of camouflaged object\ndetectors and camouflaged instance segmenters by large margins.\n","authors":["Minh-Quan Le","Minh-Triet Tran","Trung-Nghia Le","Tam V. Nguyen","Thanh-Toan Do"],"pdf_url":"https://arxiv.org/pdf/2308.15660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11418v2","updated":"2023-08-29T22:36:22Z","published":"2023-01-26T21:09:45Z","title":"Parkinson gait modelling from an anomaly deep representation","summary":" Parkinson's Disease (PD) is associated with gait movement disorders, such as\nbradykinesia, stiffness, tremors and postural instability, caused by\nprogressive dopamine deficiency. Today, some approaches have implemented\nlearning representations to quantify kinematic patterns during locomotion,\nsupporting clinical procedures such as diagnosis and treatment planning. These\napproaches assume a large amount of stratified and labeled data to optimize\ndiscriminative representations. Nonetheless, these assumptions may restrict\nsuch approaches from being operable in real scenarios during clinical practice. This\nwork introduces a self-supervised generative representation to learn\ngait-motion-related patterns, under the pretext of video reconstruction and an\nanomaly detection framework. This architecture is trained following a one-class\nweakly supervised learning scheme to avoid inter-class variance and to capture the\nmultiple relationships that represent locomotion. The proposed approach was\nvalidated with 14 PD patients and 23 control subjects, and trained with the\ncontrol population only, achieving an AUC of 95%, homoscedasticity level of 70%\nand shapeness level of 70% in the classification task considering its\ngeneralization.\n","authors":["Edgar Rangel","Fabio Martinez"],"pdf_url":"https://arxiv.org/pdf/2301.11418v2.pdf","comment":"Journal not submitted to any editorial"},{"id":"http://arxiv.org/abs/2305.02422v3","updated":"2023-08-29T22:12:04Z","published":"2023-05-03T20:29:04Z","title":"GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content","summary":" The mobile cloud gaming industry has been rapidly growing over the last\ndecade. When streaming gaming videos are transmitted to customers' client\ndevices from cloud servers, algorithms that can monitor distorted video quality\nwithout having any reference video available are desirable tools. However,\ncreating No-Reference Video Quality Assessment (NR VQA) models that can\naccurately predict the quality of streaming gaming videos rendered by computer\ngraphics engines is a challenging problem, since gaming content generally\ndiffers statistically from naturalistic videos, often lacks detail, and\ncontains many smooth regions.
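Returning to the CamoFourier augmentation described above: its low/high-frequency swap can be sketched with a plain FFT. The fixed circular mask and radius below stand in for the paper's learned adaptive swapping and GAN-generated reference image, so treat this only as a shape-level illustration.

```python
# Mix a reference image's low frequencies with an input's high frequencies.
import numpy as np

def fourier_mix(inp, ref, radius=4):
    """Single-channel 2-D frequency swap with a fixed low-pass radius."""
    Fi = np.fft.fftshift(np.fft.fft2(inp))
    Fr = np.fft.fftshift(np.fft.fft2(ref))
    h, w = inp.shape
    yy, xx = np.ogrid[:h, :w]
    low = (yy - h // 2) ** 2 + (xx - w // 2) ** 2 <= radius ** 2
    mixed = np.where(low, Fr, Fi)  # low band from ref, high band from input
    return np.real(np.fft.ifft2(np.fft.ifftshift(mixed)))

aug = fourier_mix(np.random.rand(32, 32), np.random.rand(32, 32))
print(aug.shape)
```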
Until recently, the problem has been further\ncomplicated by the lack of adequate subjective quality databases of mobile\ngaming content. We have created a new gaming-specific NR VQA model called the\nGaming Video Quality Evaluator (GAMIVAL), which combines and leverages the\nadvantages of spatial and temporal gaming distorted scene statistics models, a\nneural noise model, and deep semantic features. Using a support vector\nregression (SVR) as a regressor, GAMIVAL achieves superior performance on the\nnew LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database.\n","authors":["Yu-Chih Chen","Avinab Saha","Chase Davis","Bo Qiu","Xiaoming Wang","Rahul Gowda","Ioannis Katsavounidis","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2305.02422v3.pdf","comment":"Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been\n made available online: https://github.com/lskdream/GAMIVAL"},{"id":"http://arxiv.org/abs/2303.02698v3","updated":"2023-08-29T20:57:07Z","published":"2023-03-05T15:27:24Z","title":"Robust affine point matching via quadratic assignment on Grassmannians","summary":" Robust Affine matching with Grassmannians (RAG) is a new algorithm to perform\naffine registration of point clouds. The algorithm is based on minimizing the\nFrobenius distance between two elements of the Grassmannian. For this purpose,\nan indefinite relaxation of the Quadratic Assignment Problem (QAP) is used, and\nseveral approaches to affine feature matching are studied and compared.\nExperiments demonstrate that RAG is more robust to noise and point discrepancy\nthan previous methods.\n","authors":["Alexander Kolpakov","Michael Werman"],"pdf_url":"https://arxiv.org/pdf/2303.02698v3.pdf","comment":"8 pages, 23 figures; GitHub repository at\n (https://github.com/sashakolpakov/rag)"},{"id":"http://arxiv.org/abs/2305.18221v3","updated":"2023-08-29T20:52:57Z","published":"2023-05-29T17:01:54Z","title":"GazeGNN: A Gaze-Guided Graph Neural Network for Chest X-ray\n Classification","summary":" Eye tracking research is important in computer vision because it can help us\nunderstand how humans interact with the visual world. Specifically for\nhigh-risk applications, such as in medical imaging, eye tracking can help us to\ncomprehend how radiologists and other medical professionals search, analyze,\nand interpret images for diagnostic and clinical purposes. Hence, the\napplication of eye tracking techniques in disease classification has become\nincreasingly popular in recent years. Contemporary works usually transform gaze\ninformation collected by eye tracking devices into visual attention maps (VAMs)\nto supervise the learning process. However, this is a time-consuming\npreprocessing step, which stops us from applying eye tracking to radiologists'\ndaily work. To solve this problem, we propose a novel gaze-guided graph neural\nnetwork (GNN), GazeGNN, to leverage raw eye-gaze data without being converted\ninto VAMs. In GazeGNN, to directly integrate eye gaze into image\nclassification, we create a unified representation graph that models both\nimages and gaze pattern information. With this benefit, we develop a real-time,\nreal-world, end-to-end disease classification algorithm for the first time in\nthe literature. This achievement demonstrates the practicality and feasibility\nof integrating real-time eye tracking techniques into the daily work of\nradiologists. To our best knowledge, GazeGNN is the first work that adopts GNN\nto integrate image and eye-gaze data. 
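GAMIVAL's final regression stage in the abstract above (pooled features mapped to quality scores by a support vector regressor) is standard enough to sketch; the synthetic features and scores below are fabricated stand-ins for the paper's scene-statistics and deep features.

```python
# Feature-vector -> quality-score regression with an SVR, as in NR VQA pipelines.
import numpy as np
from sklearn.svm import SVR

rng = np.random.default_rng(0)
feats = rng.normal(size=(120, 10))  # per-video feature vectors (synthetic)
mos = feats[:, 0] * 2 + rng.normal(scale=0.1, size=120)  # synthetic quality labels

model = SVR(kernel="rbf").fit(feats[:100], mos[:100])
print(model.predict(feats[100:])[:3])  # predicted scores for held-out videos
```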
Our experiments on the public chest X-ray\ndataset show that our proposed method exhibits the best classification\nperformance compared to existing methods. The code is available at\nhttps://github.com/ukaukaaaa/GazeGNN.\n","authors":["Bin Wang","Hongyi Pan","Armstrong Aboah","Zheyuan Zhang","Elif Keles","Drew Torigian","Baris Turkbey","Elizabeth Krupinski","Jayaram Udupa","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2305.18221v3.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2308.15624v1","updated":"2023-08-29T20:45:41Z","published":"2023-08-29T20:45:41Z","title":"Detection of Mild Cognitive Impairment Using Facial Features in Video\n Conversations","summary":" Early detection of Mild Cognitive Impairment (MCI) leads to early\ninterventions to slow the progression from MCI into dementia. Deep Learning\n(DL) algorithms could help achieve early non-invasive, low-cost detection of\nMCI. This paper presents the detection of MCI in older adults using DL models\nbased only on facial features extracted from video-recorded conversations at\nhome. We used the data collected from the I-CONECT behavioral intervention\nstudy (NCT02871921), where several sessions of semi-structured interviews\nbetween socially isolated older individuals and interviewers were video\nrecorded. We develop a framework that extracts spatial holistic facial features\nusing a convolutional autoencoder and temporal information using transformers.\nOur proposed DL model was able to detect the I-CONECT study participants'\ncognitive conditions (MCI vs. those with normal cognition (NC)) using facial\nfeatures. The segments and sequence information of the facial features improved\nthe prediction performance compared with the non-temporal features. The\ndetection accuracy using this combined method reached 88% whereas 84% is the\naccuracy without applying the segments and sequences information of the facial\nfeatures within a video on a certain theme.\n","authors":["Muath Alsuhaibani","Hiroko H. Dodge","Mohammad H. Mahoor"],"pdf_url":"https://arxiv.org/pdf/2308.15624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15618v1","updated":"2023-08-29T20:25:49Z","published":"2023-08-29T20:25:49Z","title":"RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware\n Contextual Reasoning on Whole Slide Images","summary":" Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer\nin the US. It is diagnosed by manual multi-class tumor grading using a tissue\nwhole slide image (WSI), which is subjective and suffers from inter-pathologist\nvariability. We propose an automated weakly-supervised grading approach for\ncSCC WSIs that is trained using WSI-level grade and does not require\nfine-grained tumor annotations. The proposed model, RACR-MIL, transforms each\nWSI into a bag of tiled patches and leverages attention-based multiple-instance\nlearning to assign a WSI-level grade. We propose three key innovations to\naddress general as well as cSCC-specific challenges in tumor grading. First, we\nleverage spatial and semantic proximity to define a WSI graph that encodes both\nlocal and non-local dependencies between tumor regions and leverage graph\nattention convolution to derive contextual patch features. Second, we introduce\na novel ordinal ranking constraint on the patch attention network to ensure\nthat higher-grade tumor regions are assigned higher attention. Third, we use\ntumor depth as an auxiliary task to improve grade classification in a multitask\nlearning framework. 
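A minimal sketch of the attention-based multiple-instance pooling that RACR-MIL builds on: per-patch attention weights pool a bag of patch features into one slide-level vector for grading. The layer sizes are assumptions, and the paper's graph attention convolution, ordinal ranking constraint, and depth auxiliary task are omitted.

```python
# Attention-based MIL pooling over a bag of patch features.
import torch
import torch.nn as nn

class AttentionMIL(nn.Module):
    def __init__(self, d=32, n_grades=3):
        super().__init__()
        self.score = nn.Sequential(nn.Linear(d, 16), nn.Tanh(), nn.Linear(16, 1))
        self.head = nn.Linear(d, n_grades)

    def forward(self, bag):  # bag: (num_patches, d)
        a = torch.softmax(self.score(bag), dim=0)  # per-patch attention weights
        slide = (a * bag).sum(dim=0)               # attention-weighted pooling
        return self.head(slide), a                 # WSI-level logits + attention

logits, attn = AttentionMIL()(torch.randn(50, 32))
print(logits.shape, attn.shape)
# an ordinal ranking constraint would be an extra loss term on `attn`
```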
RACR-MIL achieves 2-9% improvement in grade classification\nover existing weakly-supervised approaches on a dataset of 718 cSCC tissue\nimages and localizes the tumor better. The model achieves 5-20% higher accuracy\nin difficult-to-classify high-risk grade classes and is robust to class\nimbalance.\n","authors":["Anirudh Choudhary","Angelina Hwang","Jacob Kechter","Krishnakant Saboo","Blake Bordeaux","Puneet Bhullar","Nneka Comfere","David DiCaudo","Steven Nelson","Emma Johnson","Leah Swanson","Dennis Murphree","Aaron Mangold","Ravishankar K. Iyer"],"pdf_url":"https://arxiv.org/pdf/2308.15618v1.pdf","comment":"7 pages main text, 2 page references, 3 page appendix; submitted to\n AAAI"},{"id":"http://arxiv.org/abs/2308.15575v1","updated":"2023-08-29T19:04:42Z","published":"2023-08-29T19:04:42Z","title":"Prototype Fission: Closing Set for Robust Open-set Semi-supervised\n Learning","summary":" Semi-supervised Learning (SSL) has been proven vulnerable to\nout-of-distribution (OOD) samples in realistic large-scale unsupervised\ndatasets due to over-confident pseudo-labeling OODs as in-distribution (ID). A\nkey underlying problem is class-wise latent space spreading from closed seen\nspace to open unseen space, and the bias is further magnified in SSL's\nself-training loops. To close the ID distribution set so that OODs are better\nrejected for safe SSL, we propose Prototype Fission(PF) to divide class-wise\nlatent spaces into compact sub-spaces by automatic fine-grained latent space\nmining, driven by coarse-grained labels only. Specifically, we form multiple\nunique learnable sub-class prototypes for each class, optimized towards both\ndiversity and consistency. The Diversity Modeling term encourages samples to be\nclustered by one of the multiple sub-class prototypes, while the Consistency\nModeling term clusters all samples of the same class to a global prototype.\nInstead of \"opening set\", i.e., modeling OOD distribution, Prototype Fission\n\"closes set\" and makes it hard for OOD samples to fit in sub-class latent\nspace. Therefore, PF is compatible with existing methods for further\nperformance gains. Extensive experiments validate the effectiveness of our\nmethod in open-set SSL settings in terms of successfully forming sub-classes,\ndiscriminating OODs from IDs and improving overall accuracy. Codes will be\nreleased.\n","authors":["Xuwei Tan","Yi-Jie Huang","Yaqian Li"],"pdf_url":"https://arxiv.org/pdf/2308.15575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12542v2","updated":"2023-08-29T18:40:19Z","published":"2022-11-22T19:18:30Z","title":"CASSPR: Cross Attention Single Scan Place Recognition","summary":" Place recognition based on point clouds (LiDAR) is an important component for\nautonomous robots or self-driving vehicles. Current SOTA performance is\nachieved on accumulated LiDAR submaps using either point-based or voxel-based\nstructures. While voxel-based approaches nicely integrate spatial context\nacross multiple scales, they do not exhibit the local precision of point-based\nmethods. As a result, existing methods struggle with fine-grained matching of\nsubtle geometric features in sparse single-shot Li- DAR scans. To overcome\nthese limitations, we propose CASSPR as a method to fuse point-based and\nvoxel-based approaches using cross attention transformers. CASSPR leverages a\nsparse voxel branch for extracting and aggregating information at lower\nresolution and a point-wise branch for obtaining fine-grained local\ninformation. 
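The cross-branch querying CASSPR describes (elaborated in the next sentence) can be sketched as one cross-attention step, here with point features as queries over voxel features; sizes are assumptions, and the real model applies this inside transformer blocks in both directions.

```python
# One cross-attention step fusing a point-wise branch with a voxel branch.
import torch
import torch.nn.functional as F

d = 16
point_feats = torch.randn(100, d)  # fine-grained point-wise branch
voxel_feats = torch.randn(20, d)   # coarser sparse-voxel branch

q, k, v = point_feats, voxel_feats, voxel_feats
attn = torch.softmax(q @ k.T / d ** 0.5, dim=-1)  # points query voxel context
fused = attn @ v                                  # points enriched with voxel info
global_desc = F.normalize(fused.max(dim=0).values, dim=0)  # toy pooled descriptor
print(global_desc.shape)
```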
CASSPR uses queries from one branch to try to match structures in\nthe other branch, ensuring that both extract self-contained descriptors of the\npoint cloud (rather than one branch dominating), but using both to inform the\noutput global descriptor of the point cloud. Extensive experiments show that\nCASSPR surpasses the state-of-the-art by a large margin on several datasets\n(Oxford RobotCar, TUM, USyd). For instance, it achieves AR@1 of 85.6% on the\nTUM dataset, surpassing the strongest prior model by ~15%. Our code is publicly\navailable.\n","authors":["Yan Xia","Mariia Gladkova","Rui Wang","Qianyun Li","Uwe Stilla","João F. Henriques","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2211.12542v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.15564v1","updated":"2023-08-29T18:36:21Z","published":"2023-08-29T18:36:21Z","title":"Learning Sequential Information in Task-based fMRI for Synthetic Data\n Augmentation","summary":" Insufficiency of training data is a persistent issue in medical image\nanalysis, especially for task-based functional magnetic resonance images (fMRI)\nwith spatio-temporal imaging data acquired using specific cognitive tasks. In\nthis paper, we propose an approach for generating synthetic fMRI sequences that\ncan then be used to create augmented training datasets in downstream learning\ntasks. To synthesize high-resolution task-specific fMRI, we adapt the\n$\\alpha$-GAN structure, leveraging advantages of both GAN and variational\nautoencoder models, and propose different alternatives in aggregating temporal\ninformation. The synthetic images are evaluated from multiple perspectives\nincluding visualizations and an autism spectrum disorder (ASD) classification\ntask. The results show that the synthetic task-based fMRI can provide effective\ndata augmentation in learning the ASD classification task.\n","authors":["Jiyao Wang","Nicha C. Dvornek","Lawrence H. Staib","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2308.15564v1.pdf","comment":"Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI\n workshop), preprint version"},{"id":"http://arxiv.org/abs/2308.15557v1","updated":"2023-08-29T18:24:28Z","published":"2023-08-29T18:24:28Z","title":"A Pseudo-Boolean Polynomials Approach for Image Edge Detection","summary":" We introduce a novel approach for image edge detection based on\npseudo-Boolean polynomials for image patches. We show that patches covering\nedge regions in the image result in pseudo-Boolean polynomials with higher\ndegrees compared to patches that cover blob regions. The proposed approach is\nbased on reduction of polynomial degree and equivalence properties of\npenalty-based pseudo-Boolean polynomials.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin"],"pdf_url":"https://arxiv.org/pdf/2308.15557v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2110.03105v3","updated":"2023-08-29T18:15:10Z","published":"2021-10-06T23:37:21Z","title":"MetaCOG: Learning a Metacognition to Recover What Objects Are Actually\n There","summary":" Humans not only form representations about the world based on what we see,\nbut also learn meta-cognitive representations about how our own vision works.\nThis enables us to recognize when our vision is unreliable (e.g., when we\nrealize that we are experiencing a visual illusion) and enables us to question\nwhat we see. 
Inspired by this human capacity, we present MetaCOG: a model that\nincreases the robustness of object detectors by learning representations of\ntheir reliability, and does so without feedback. Specifically, MetaCOG is a\nhierarchical probabilistic model that expresses a joint distribution over the\nobjects in a 3D scene and the outputs produced by a detector. When paired with\nan off-the-shelf object detector, MetaCOG takes detections as input and infers\nthe detector's tendencies to miss objects of certain categories and to\nhallucinate objects that are not actually present, all without access to\nground-truth object labels. When paired with three modern neural object\ndetectors, MetaCOG learns useful and accurate meta-cognitive representations,\nresulting in improved performance on the detection task. Additionally, we show\nthat MetaCOG is robust to varying levels of error in the detections. Our\nresults are a proof-of-concept for a novel approach to the problem of\ncorrecting a faulty vision system's errors. The model code, datasets, results,\nand demos are available:\nhttps://osf.io/8b9qt/?view_only=8c1b1c412c6b4e1697e3c7859be2fce6\n","authors":["Marlene Berke","Zhangir Azerbayev","Mario Belledonne","Zenna Tavares","Julian Jara-Ettinger"],"pdf_url":"https://arxiv.org/pdf/2110.03105v3.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.15547v1","updated":"2023-08-29T18:11:32Z","published":"2023-08-29T18:11:32Z","title":"Efficient Ray Sampling for Radiance Fields Reconstruction","summary":" Accelerating neural radiance fields training is of substantial practical\nvalue, as the ray sampling strategy profoundly impacts network convergence.\nMore efficient ray sampling can thus directly enhance existing NeRF models'\ntraining efficiency. We therefore propose a novel ray sampling approach for\nneural radiance fields that improves training efficiency while retaining\nphotorealistic rendering results. First, we analyze the relationship between\nthe pixel loss distribution of sampled rays and rendering quality. This reveals\nredundancy in the original NeRF's uniform ray sampling. Guided by this finding,\nwe develop a sampling method leveraging pixel regions and depth boundaries. Our\nmain idea is to sample fewer rays in training views, yet with each ray more\ninformative for scene fitting. Sampling probability increases in pixel areas\nexhibiting significant color and depth variation, greatly reducing wasteful\nrays from other regions without sacrificing precision. Through this method, not\nonly can the convergence of the network be accelerated, but the spatial\ngeometry of a scene can also be perceived more accurately. Rendering outputs\nare enhanced, especially for texture-complex regions. Experiments demonstrate\nthat our method significantly outperforms state-of-the-art techniques on public\nbenchmark datasets.\n","authors":["Shilei Sun","Ming Liu","Zhongyi Fan","Yuxue Liu","Chengwei Lv","Liquan Dong","Lingqin Kong"],"pdf_url":"https://arxiv.org/pdf/2308.15547v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.15536v1","updated":"2023-08-29T18:00:22Z","published":"2023-08-29T18:00:22Z","title":"DebSDF: Delving into the Details and Bias of Neural Indoor Scene\n Reconstruction","summary":" In recent years, the neural implicit surface has emerged as a powerful\nrepresentation for multi-view surface reconstruction due to its simplicity and\nstate-of-the-art performance. 
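The variation-guided strategy from the ray-sampling abstract above reduces to drawing rays with probability proportional to a per-pixel informativeness score; the random "variation" map below is a stand-in for the paper's pixel-loss and depth-boundary statistics.

```python
# Sample training rays in proportion to per-pixel color/depth variation.
import numpy as np

rng = np.random.default_rng(0)
pixel_var = rng.random((64, 64)) ** 2  # stand-in for measured variation
prob = pixel_var / pixel_var.sum()     # high-variation pixels sampled more often

idx = rng.choice(64 * 64, size=1024, replace=False, p=prob.ravel())
rows, cols = np.unravel_index(idx, (64, 64))
print(rows[:5], cols[:5])  # pixel coordinates of the selected rays
```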
However, reconstructing smooth and detailed\nsurfaces in indoor scenes from multi-view images presents unique challenges.\nIndoor scenes typically contain large texture-less regions, making the\nphotometric loss unreliable for optimizing the implicit surface. Previous work\nutilizes monocular geometry priors to improve the reconstruction in indoor\nscenes. However, monocular priors often contain substantial errors in thin\nstructure regions due to domain gaps and the inherent inconsistencies when\nderived independently from different views. This paper presents \\textbf{DebSDF}\nto address these challenges, focusing on the utilization of uncertainty in\nmonocular priors and the bias in SDF-based volume rendering. We propose an\nuncertainty modeling technique that associates larger uncertainties with larger\nerrors in the monocular priors. High-uncertainty priors are then excluded from\noptimization to prevent bias. This uncertainty measure also informs an\nimportance-guided ray sampling and adaptive smoothness regularization,\nenhancing the learning of fine structures. We further introduce a bias-aware\nsigned distance function to density transformation that takes into account the\ncurvature and the angle between the view direction and the SDF normals to\nreconstruct fine details better. Our approach has been validated through\nextensive experiments on several challenging datasets, demonstrating improved\nqualitative and quantitative results in reconstructing thin structures in\nindoor scenes, thereby outperforming previous work.\n","authors":["Yuting Xiao","Jingwei Xu","Zehao Yu","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2308.15536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15517v1","updated":"2023-08-29T16:58:03Z","published":"2023-08-29T16:58:03Z","title":"Document AI: A Comparative Study of Transformer-Based, Graph-Based\n Models, and Convolutional Neural Networks For Document Layout Analysis","summary":" Document AI aims to automatically analyze documents by leveraging natural\nlanguage processing and computer vision techniques. One of the major tasks of\nDocument AI is document layout analysis, which structures document pages by\ninterpreting the content and spatial relationships of layout, image, and text.\nThis task can be image-centric, wherein the aim is to identify and label\nvarious regions such as authors and paragraphs, or text-centric, where the\nfocus is on classifying individual words in a document. Although there are\nincreasingly sophisticated methods for improving layout analysis, doubts remain\nabout the extent to which their findings can be generalized to a broader\ncontext. Specifically, prior work developed systems based on very different\narchitectures, such as transformer-based, graph-based, and CNNs. However, no\nwork has mentioned the effectiveness of these models in a comparative analysis.\nMoreover, while language-independent Document AI models capable of knowledge\ntransfer have been developed, it remains to be investigated to what degree they\ncan effectively transfer knowledge. 
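DebSDF's exclusion of unreliable monocular priors, described above, can be caricatured in a few lines: mask out the prior loss wherever the modeled uncertainty is high. The synthetic arrays and the 80%-quantile threshold are assumptions, not the paper's actual uncertainty model.

```python
# Drop high-uncertainty monocular priors from the reconstruction loss.
import numpy as np

rng = np.random.default_rng(0)
rendered = rng.random(1000)                        # rendered depth per sampled ray
prior = rendered + rng.normal(0, 0.05, 1000)       # monocular depth prior
uncertainty = np.abs(rng.normal(0, 1, 1000))       # modeled prior uncertainty

keep = uncertainty < np.quantile(uncertainty, 0.8)  # exclude the worst 20%
prior_loss = np.mean((rendered[keep] - prior[keep]) ** 2)
print(round(float(prior_loss), 5))
```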
In this study, we aim to fill these gaps by\nconducting a comparative evaluation of state-of-the-art models in document\nlayout analysis and investigating the potential of cross-lingual layout\nanalysis by utilizing machine translation techniques.\n","authors":["Sotirios Kastanas","Shaomu Tan","Yi He"],"pdf_url":"https://arxiv.org/pdf/2308.15517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15512v1","updated":"2023-08-29T15:39:15Z","published":"2023-08-29T15:39:15Z","title":"Shatter and Gather: Learning Referring Image Segmentation with Text\n Supervision","summary":" Referring image segmentation, the task of segmenting any arbitrary entities\ndescribed in free-form texts, opens up a variety of vision applications.\nHowever, manual labeling of training data for this task is prohibitively\ncostly, leading to lack of labeled data for training. We address this issue by\na weakly supervised learning approach using text descriptions of training\nimages as the only source of supervision. To this end, we first present a new\nmodel that discovers semantic entities in input image and then combines such\nentities relevant to text query to predict the mask of the referent. We also\npresent a new loss function that allows the model to be trained without any\nfurther supervision. Our method was evaluated on four public benchmarks for\nreferring image segmentation, where it clearly outperformed the existing method\nfor the same task and recent open-vocabulary segmentation models on all the\nbenchmarks.\n","authors":["Dongwon Kim","Namyup Kim","Cuiling Lan","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.15512v1.pdf","comment":"Accepted to ICCV 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2306.08018v2","updated":"2023-08-29T17:13:05Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a meticulously curated, comprehensive\ninstruction dataset expressly designed for the biomolecular realm.\nMol-Instructions is composed of three pivotal components: molecule-oriented\ninstructions, protein-oriented instructions, and biomolecular text\ninstructions, each curated to enhance the understanding and prediction\ncapabilities of LLMs concerning biomolecular features and behaviors. Through\nextensive instruction tuning experiments on the representative LLM, we\nunderscore the potency of Mol-Instructions to enhance the adaptability and\ncognitive acuity of large models within the complex sphere of biomolecular\nstudies, thereby promoting advancements in the biomolecular research community.\nMol-Instructions is made publicly accessible for future research endeavors and\nwill be subjected to continual updates for enhanced applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v2.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions. 
Add\n quantitative evaluations"},{"id":"http://arxiv.org/abs/2307.07740v2","updated":"2023-08-29T16:55:11Z","published":"2023-07-15T08:08:38Z","title":"Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model","summary":" Sentiment analysis is the process of identifying and categorizing people's\nemotions or opinions regarding various topics. The analysis of Twitter\nsentiment has become an increasingly popular topic in recent years. In this\npaper, we present several machine learning models and a deep learning model to\nanalyze the sentiment of Persian political tweets. Our analysis was conducted\nusing Bag of Words and ParsBERT for word representation. We applied Gaussian\nNaive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random\nForests, as well as a combination of CNN and LSTM to classify the polarities of\ntweets. The results of this study indicate that deep learning with ParsBERT\nembeddings performs better than machine learning. The CNN-LSTM model had the\nhighest classification accuracy with 89 percent on the first dataset and 71\npercent on the second dataset. Due to the complexity of Persian, it was a\ndifficult task to achieve this level of efficiency. The main objective of our\nresearch was to reduce the training time while maintaining the model's\nperformance. As a result, several adjustments were made to the model\narchitecture and parameters. In addition to achieving the objective, the\nperformance was slightly improved as well.\n","authors":["Mohammad Dehghani","Zahra Yazdanparast"],"pdf_url":"https://arxiv.org/pdf/2307.07740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15405v1","updated":"2023-08-29T16:07:18Z","published":"2023-08-29T16:07:18Z","title":"Robust Long-Tailed Learning via Label-Aware Bounded CVaR","summary":" Data in real-world classification problems are always imbalanced or\nlong-tailed, wherein the majority classes contain most of the samples and\ndominate the model training. In such a setting, a naive model tends to have\npoor performance on the minority classes. Previously, a variety of loss\nmodifications have been proposed to address the long-tailed learning problem,\nwhile these methods either treat the samples in the same class\nindiscriminately or lack a theoretical guarantee. In this paper, we propose\ntwo novel approaches based on CVaR (Conditional Value at Risk) to improve the\nperformance of long-tailed learning with solid theoretical grounding.\nSpecifically, we first introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss\nto overcome the pessimistic result of the original CVaR, and further design the\noptimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we\nadditionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to\nstabilize the optimization process, where we also offer theoretical\nsupport.
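An empirical CVaR tail loss, the building block behind LAB-CVaR above, is easy to sketch: average only the worst alpha-fraction of per-sample losses, optionally per class. The per-class loop below hints at the label-aware variant; the paper's optimal weight bounds and logit adjustment are not implemented.

```python
# Empirical CVaR: mean of the worst alpha-fraction of losses, per class.
import numpy as np

def cvar(losses, alpha=0.2):
    """Average of the worst alpha-fraction of losses (empirical CVaR)."""
    k = max(1, int(np.ceil(alpha * len(losses))))
    return np.sort(losses)[-k:].mean()

rng = np.random.default_rng(0)
losses = rng.exponential(size=100)        # synthetic per-sample losses
labels = rng.integers(0, 3, size=100)     # synthetic class labels
# label-aware variant: one CVaR term per class (bounds/weights omitted)
print([round(cvar(losses[labels == c]), 3) for c in range(3)])
```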
Extensive experiments on real-world datasets with long-tailed label\ndistributions verify the superiority of our proposed methods.\n","authors":["Hong Zhu","Runpeng Yu","Xing Tang","Yifei Wang","Yuan Fang","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15265v1","updated":"2023-08-29T12:50:21Z","published":"2023-08-29T12:50:21Z","title":"A Multi-Perspective Learning to Rank Approach to Support Children's\n Information Seeking in the Classroom","summary":" We introduce a novel re-ranking model that aims to augment the functionality\nof standard search engines to support classroom search activities for children\n(ages 6 to 11). This model extends the known listwise learning-to-rank\nframework by balancing risk and reward. Doing so enables the model to\nprioritize Web resources of high educational alignment, appropriateness, and\nadequate readability by analyzing the URLs, snippets, and page titles of Web\nresources retrieved by a given mainstream search engine. Experimental results,\nincluding an ablation study and comparisons with existing baselines, showcase\nthe correctness of the proposed model. The outcomes of this work demonstrate\nthe value of considering multiple perspectives inherent to the classroom\nsetting, e.g., educational alignment, readability, and objectionability, when\napplied to the design of algorithms that can better support children's\ninformation discovery.\n","authors":["Garrett Allen","Katherine Landau Wright","Jerry Alan Fails","Casey Kennington","Maria Soledad Pera"],"pdf_url":"https://arxiv.org/pdf/2308.15265v1.pdf","comment":"Extended version of the manuscript to appear in proceedings of the\n 22nd IEEE/WIC International Conference on Web Intelligence and Intelligent\n Agent Technology"},{"id":"http://arxiv.org/abs/2308.15244v1","updated":"2023-08-29T12:11:16Z","published":"2023-08-29T12:11:16Z","title":"Knowledge-based Multiple Adaptive Spaces Fusion for Recommendation","summary":" Since Knowledge Graphs (KGs) contain rich semantic information, recently\nthere has been an influx of KG-enhanced recommendation methods. Most of\nexisting methods are entirely designed based on euclidean space without\nconsidering curvature. However, recent studies have revealed that a tremendous\ngraph-structured data exhibits highly non-euclidean properties. Motivated by\nthese observations, in this work, we propose a knowledge-based multiple\nadaptive spaces fusion method for recommendation, namely MCKG. Unlike existing\nmethods that solely adopt a specific manifold, we introduce the unified space\nthat is compatible with hyperbolic, euclidean and spherical spaces.\nFurthermore, we fuse the multiple unified spaces in an attention manner to\nobtain the high-quality embeddings for better knowledge propagation. In\naddition, we propose a geometry-aware optimization strategy which enables the\npull and push processes benefited from both hyperbolic and spherical spaces.\nSpecifically, in hyperbolic space, we set smaller margins in the area near to\nthe origin, which is conducive to distinguishing between highly similar\npositive items and negative ones. At the same time, we set larger margins in\nthe area far from the origin to ensure the model has sufficient error\ntolerance. The similar manner also applies to spherical spaces. Extensive\nexperiments on three real-world datasets demonstrate that the MCKG has a\nsignificant improvement over state-of-the-art recommendation methods. 
Further\nablation experiments verify the importance of multi-space fusion and\ngeometry-aware optimization strategy, justifying the rationality and\neffectiveness of MCKG.\n","authors":["Meng Yuan","Fuzhen Zhuang","Zhao Zhang","Deqing Wang","Jin Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15232v1","updated":"2023-08-29T11:40:24Z","published":"2023-08-29T11:40:24Z","title":"Classification-Aware Neural Topic Model Combined With Interpretable\n Analysis -- For Conflict Classification","summary":" A large number of conflict events are affecting the world all the time. In\norder to analyse such conflict events effectively, this paper presents a\nClassification-Aware Neural Topic Model (CANTM-IA) for Conflict Information\nClassification and Topic Discovery. The model provides a reliable\ninterpretation of classification results and discovered topics by introducing\ninterpretability analysis. At the same time, interpretation is introduced into\nthe model architecture to improve the classification performance of the model\nand to allow interpretation to focus further on the details of the data.\nFinally, the model architecture is optimised to reduce the complexity of the\nmodel.\n","authors":["Tianyu Liang","Yida Mu","Soonho Kim","Darline Larissa Kengne Kuate","Julie Lang","Rob Vos","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.15232v1.pdf","comment":"Accepted by RANLP 2023"},{"id":"http://arxiv.org/abs/2308.15230v1","updated":"2023-08-29T11:37:33Z","published":"2023-08-29T11:37:33Z","title":"Providing Previously Unseen Users Fair Recommendations Using Variational\n Autoencoders","summary":" An emerging definition of fairness in machine learning requires that models\nare oblivious to demographic user information, e.g., a user's gender or age\nshould not influence the model. Personalized recommender systems are\nparticularly prone to violating this definition through their explicit user\nfocus and user modelling. Explicit user modelling is also an aspect that makes\nmany recommender systems incapable of providing hitherto unseen users with\nrecommendations. We propose novel approaches for mitigating discrimination in\nVariational Autoencoder-based recommender systems by limiting the encoding of\ndemographic information. The approaches are capable of, and evaluated on,\nproviding users that are not represented in the training data with fair\nrecommendations.\n","authors":["Bjørnar Vassøy","Helge Langseth","Benjamin Kille"],"pdf_url":"https://arxiv.org/pdf/2308.15230v1.pdf","comment":"Appearing in RecSys 2023 proceedings"},{"id":"http://arxiv.org/abs/2308.15136v1","updated":"2023-08-29T09:10:53Z","published":"2023-08-29T09:10:53Z","title":"CAGRA: Highly Parallel Graph Construction and Approximate Nearest\n Neighbor Search for GPUs","summary":" Approximate Nearest Neighbor Search (ANNS) plays a critical role in various\ndisciplines spanning data mining and artificial intelligence, from information\nretrieval and computer vision to natural language processing and recommender\nsystems. Data volumes have soared in recent years and the computational cost of\nan exhaustive exact nearest neighbor search is often prohibitive, necessitating\nthe adoption of approximate techniques. 
The balanced performance and recall of\ngraph-based approaches have more recently garnered significant attention in\nANNS algorithms, however, only a few studies have explored harnessing the power\nof GPUs and multi-core processors despite the widespread use of massively\nparallel and general-purpose computing. To bridge this gap, we introduce a\nnovel parallel computing hardware-based proximity graph and search algorithm.\nBy leveraging the high-performance capabilities of modern hardware, our\napproach achieves remarkable efficiency gains. In particular, our method\nsurpasses existing CPU and GPU-based methods in constructing the proximity\ngraph, demonstrating higher throughput in both large- and small-batch searches\nwhile maintaining compatible accuracy. In graph construction time, our method,\nCAGRA, is 2.2~27x faster than HNSW, which is one of the CPU SOTA\nimplementations. In large-batch query throughput in the 90% to 95% recall\nrange, our method is 33~77x faster than HNSW, and is 3.8~8.8x faster than the\nSOTA implementations for GPU. For a single query, our method is 3.4~53x faster\nthan HNSW at 95% recall.\n","authors":["Hiroyuki Ootomo","Akira Naruse","Corey Nolet","Ray Wang","Tamas Feher","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15090v1","updated":"2023-08-29T07:53:17Z","published":"2023-08-29T07:53:17Z","title":"Killing two birds with one stone: Can an audio captioning system also be\n used for audio-text retrieval?","summary":" Automated Audio Captioning (AAC) aims to develop systems capable of\ndescribing an audio recording using a textual sentence. In contrast, Audio-Text\nRetrieval (ATR) systems seek to find the best matching audio recording(s) for a\ngiven textual query (Text-to-Audio) or vice versa (Audio-to-Text). These tasks\nrequire different types of systems: AAC employs a sequence-to-sequence model,\nwhile ATR utilizes a ranking model that compares audio and text representations\nwithin a shared projection subspace. However, this work investigates the\nrelationship between AAC and ATR by exploring the ATR capabilities of an\nunmodified AAC system, without fine-tuning for the new task. Our AAC system\nconsists of an audio encoder (ConvNeXt-Tiny) trained on AudioSet for audio\ntagging, and a transformer decoder responsible for generating sentences. For\nAAC, it achieves a high SPIDEr-FL score of 0.298 on Clotho and 0.472 on\nAudioCaps on average. For ATR, we propose using the standard Cross-Entropy loss\nvalues obtained for any audio/caption pair. Experimental results on the Clotho\nand AudioCaps datasets demonstrate decent recall values using this simple\napproach. For instance, we obtained a Text-to-Audio R@1 value of 0.382 for\nAu-dioCaps, which is above the current state-of-the-art method without external\ndata. 
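The retrieval-by-captioning-loss idea above can be sketched by scoring each audio/caption pair with the caption model's cross-entropy and ranking by lowest loss. The position-agnostic linear "decoder" below is a deliberately degenerate stand-in for a real AAC decoder, so only the scoring pattern is meaningful.

```python
# Rank audio candidates for a text query by captioning cross-entropy.
import torch
import torch.nn as nn

vocab, d = 50, 16
decoder = nn.Linear(d, vocab)  # stand-in for the AAC sequence decoder

def caption_loss(audio_emb, caption_ids):
    # same logits at every position: a toy, position-agnostic "decoder"
    logits = decoder(audio_emb).expand(len(caption_ids), vocab)
    return nn.functional.cross_entropy(logits, caption_ids).item()

audios = [torch.randn(d) for _ in range(3)]  # pooled audio embeddings
query = torch.randint(0, vocab, (8,))        # tokenized text query
scores = [caption_loss(a, query) for a in audios]
best = min(range(3), key=lambda i: scores[i])  # Text-to-Audio retrieval
print(scores, best)
```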
Interestingly, we observe that normalizing the loss values was necessary\nfor Audio-to-Text retrieval.\n","authors":["Etienne Labbé","Thomas Pellegrini","Julien Pinquier"],"pdf_url":"https://arxiv.org/pdf/2308.15090v1.pdf","comment":"cam ready version (14/08/23)"},{"id":"http://arxiv.org/abs/2308.15033v1","updated":"2023-08-29T05:35:49Z","published":"2023-08-29T05:35:49Z","title":"STEC: See-Through Transformer-based Encoder for CTR Prediction","summary":" Click-Through Rate (CTR) prediction holds a pivotal place in online\nadvertising and recommender systems since CTR prediction performance directly\ninfluences the overall satisfaction of the users and the revenue generated by\ncompanies. Even so, CTR prediction is still an active area of research since it\ninvolves accurately modelling the preferences of users based on sparse and\nhigh-dimensional features where the higher-order interactions of multiple\nfeatures can lead to different outcomes. Most CTR prediction models have relied\non a single fusion and interaction learning strategy. The few CTR prediction\nmodels that have utilized multiple interaction modelling strategies have\ntreated each interaction to be self-contained. In this paper, we propose a\nnovel model named STEC that reaps the benefits of multiple interaction learning\napproaches in a single unified architecture. Additionally, our model introduces\nresidual connections from different orders of interactions which boosts the\nperformance by allowing lower level interactions to directly affect the\npredictions. Through extensive experiments on four real-world datasets, we\ndemonstrate that STEC outperforms existing state-of-the-art approaches for CTR\nprediction thanks to its greater expressive capabilities.\n","authors":["Serdarcan Dilbaz","Hasan Saribas"],"pdf_url":"https://arxiv.org/pdf/2308.15033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15027v1","updated":"2023-08-29T05:18:47Z","published":"2023-08-29T05:18:47Z","title":"Improving Neural Ranking Models with Traditional IR Methods","summary":" Neural ranking methods based on large transformer models have recently gained\nsignificant attention in the information retrieval community, and have been\nadopted by major commercial solutions. Nevertheless, they are computationally\nexpensive to create, and require a great deal of labeled data for specialized\ncorpora. In this paper, we explore a low resource alternative which is a\nbag-of-embedding model for document retrieval and find that it is competitive\nwith large transformer models fine tuned on information retrieval tasks. Our\nresults show that a simple combination of TF-IDF, a traditional keyword\nmatching method, with a shallow embedding model provides a low cost path to\ncompete well with the performance of complex neural ranking models on 3\ndatasets. 
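A minimal sketch of the low-cost hybrid described above: interpolate a TF-IDF keyword-matching score with a shallow-embedding similarity. TruncatedSVD over the TF-IDF matrix stands in for the paper's bag-of-embedding model, and the 0.5/0.5 mixing weights are assumptions.

```python
# Combine TF-IDF keyword matching with a shallow embedding similarity.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    "neural ranking with large transformers",
    "tf-idf keyword matching for document retrieval",
    "shallow word embeddings are cheap to train",
]
query = ["keyword matching retrieval"]

tfidf = TfidfVectorizer().fit(docs)
D, Q = tfidf.transform(docs), tfidf.transform(query)
svd = TruncatedSVD(n_components=2, random_state=0).fit(D)  # toy shallow embedding

score = 0.5 * cosine_similarity(Q, D) \
      + 0.5 * cosine_similarity(svd.transform(Q), svd.transform(D))
print(score.argsort()[0][::-1])  # documents ranked by the combined score
```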
Furthermore, adding TF-IDF measures improves the performance of\nlarge-scale fine tuned models on these tasks.\n","authors":["Anik Saha","Oktie Hassanzadeh","Alex Gittens","Jian Ni","Kavitha Srinivas","Bulent Yener"],"pdf_url":"https://arxiv.org/pdf/2308.15027v1.pdf","comment":"Short paper, 4 pages"},{"id":"http://arxiv.org/abs/2308.15014v1","updated":"2023-08-29T04:34:32Z","published":"2023-08-29T04:34:32Z","title":"CAPS: A Practical Partition Index for Filtered Similarity Search","summary":" With the surging popularity of approximate near-neighbor search (ANNS),\ndriven by advances in neural representation learning, the ability to serve\nqueries accompanied by a set of constraints has become an area of intense\ninterest. While the community has recently proposed several algorithms for\nconstrained ANNS, almost all of these methods focus on integration with\ngraph-based indexes, the predominant class of algorithms achieving\nstate-of-the-art performance in latency-recall tradeoffs. In this work, we take\na different approach and focus on developing a constrained ANNS algorithm via\nspace partitioning as opposed to graphs. To that end, we introduce Constrained\nApproximate Partitioned Search (CAPS), an index for ANNS with filters via space\npartitions that not only retains the benefits of a partition-based algorithm\nbut also outperforms state-of-the-art graph-based constrained search techniques\nin recall-latency tradeoffs, with only 10% of the index size.\n","authors":["Gaurav Gupta","Jonah Yi","Benjamin Coleman","Chen Luo","Vihan Lakshman","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2308.15014v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2308.14968v1","updated":"2023-08-29T01:46:06Z","published":"2023-08-29T01:46:06Z","title":"Continual Learning for Generative Retrieval over Dynamic Corpora","summary":" Generative retrieval (GR) directly predicts the identifiers of relevant\ndocuments (i.e., docids) based on a parametric model. It has achieved solid\nperformance on many ad-hoc retrieval tasks. So far, these tasks have assumed a\nstatic document collection. In many practical scenarios, however, document\ncollections are dynamic, where new documents are continuously added to the\ncorpus. The ability to incrementally index new documents while preserving the\nability to answer queries with both previously and newly indexed relevant\ndocuments is vital to applying GR models. In this paper, we address this\npractical continual learning problem for GR. 
We put forward a novel\nContinual-LEarner for generatiVE Retrieval (CLEVER) model and make two major\ncontributions to continual learning for GR: (i) To encode new documents into\ndocids with low computational cost, we present Incremental Product\nQuantization, which updates a partial quantization codebook according to two\nadaptive thresholds; and (ii) To memorize new documents for querying without\nforgetting previous knowledge, we propose a memory-augmented learning\nmechanism, to form meaningful connections between old and new documents.\nEmpirical results demonstrate the effectiveness and efficiency of the proposed\nmodel.\n","authors":["Jiangui Chen","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Wei Chen","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.14968v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2308.14963v1","updated":"2023-08-29T01:30:23Z","published":"2023-08-29T01:30:23Z","title":"Vector Search with OpenAI Embeddings: Lucene Is All You Need","summary":" We provide a reproducible, end-to-end demonstration of vector search with\nOpenAI embeddings using Lucene on the popular MS MARCO passage ranking test\ncollection. The main goal of our work is to challenge the prevailing narrative\nthat a dedicated vector store is necessary to take advantage of recent advances\nin deep neural networks as applied to search. Quite the contrary, we show that\nhierarchical navigable small-world network (HNSW) indexes in Lucene are\nadequate to provide vector search capabilities in a standard bi-encoder\narchitecture. This suggests that, from a simple cost-benefit analysis, there\ndoes not appear to be a compelling reason to introduce a dedicated vector store\ninto a modern \"AI stack\" for search, since such applications have already\nreceived substantial investments in existing, widely deployed infrastructure.\n","authors":["Jimmy Lin","Ronak Pradeep","Tommaso Teofili","Jasper Xian"],"pdf_url":"https://arxiv.org/pdf/2308.14963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15651v1","updated":"2023-08-29T22:03:17Z","published":"2023-08-29T22:03:17Z","title":"Ensuring User-side Fairness in Dynamic Recommender Systems","summary":" User-side group fairness is crucial for modern recommender systems, as it\naims to alleviate performance disparity between groups of users defined by\nsensitive attributes such as gender, race, or age. We find that the disparity\ntends to persist or even increase over time. This calls for effective ways to\naddress user-side fairness in a dynamic environment, which has been\ninfrequently explored in the literature. However, fairness-constrained\nre-ranking, a typical method to ensure user-side fairness (i.e., reducing\nperformance disparity), faces two fundamental challenges in the dynamic\nsetting: (1) non-differentiability of the ranking-based fairness constraint,\nwhich hinders the end-to-end training paradigm, and (2) time-inefficiency,\nwhich impedes quick adaptation to changes in user preferences. In this paper,\nwe propose FAir Dynamic rEcommender (FADE), an end-to-end framework with\nfine-tuning strategy to dynamically alleviate performance disparity. To tackle\nthe above challenges, FADE uses a novel fairness loss designed to be\ndifferentiable and lightweight to fine-tune model parameters to ensure both\nuser-side fairness and high-quality recommendations. 
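Going back to CLEVER's Incremental Product Quantization above: plain product quantization can be sketched in NumPy by splitting a vector into m sub-vectors and recording the nearest centroid in each sub-codebook. The random codebook is a stand-in for learned centroids, and the paper's incremental, threshold-based codebook updates are not shown.

```python
# Product quantization: encode a vector as m sub-codebook centroid indices.
import numpy as np

rng = np.random.default_rng(0)
dim, m, k = 16, 4, 8                         # m sub-spaces, k centroids each
codebook = rng.normal(size=(m, k, dim // m))  # stand-in for learned centroids

def pq_encode(x):
    parts = x.reshape(m, dim // m)
    return np.array([
        np.argmin(((codebook[j] - parts[j]) ** 2).sum(axis=1)) for j in range(m)
    ])

docid_code = pq_encode(rng.normal(size=dim))  # a compact docid-style code
print(docid_code)
```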
Via extensive experiments\non a real-world dataset, we empirically demonstrate that FADE effectively and\nefficiently reduces performance disparity, and furthermore, FADE improves\noverall recommendation quality over time compared to not using any new data.\n","authors":["Hyunsik Yoo","Zhichen Zeng","Jian Kang","Zhining Liu","David Zhou","Fei Wang","Eunice Chan","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2308.15651v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.08303v3","updated":"2023-08-29T21:52:58Z","published":"2023-07-17T07:55:47Z","title":"Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language\n Models","summary":" Dense retrieval (DR) converts queries and documents into dense embeddings and\nmeasures the similarity between queries and documents in vector space. One of\nthe challenges in DR is the lack of domain-specific training data. While DR\nmodels can learn from large-scale public datasets like MS MARCO through\ntransfer learning, evidence shows that not all DR models and domains can\nbenefit from transfer learning equally. Recently, some researchers have\nresorted to large language models (LLMs) to improve the zero-shot and few-shot\nDR models. However, the hard prompts or human-written prompts utilized in these\nworks cannot guarantee the quality of the generated weak queries. To tackle\nthis, we propose soft prompt tuning for augmenting DR (SPTAR): For each task,\nwe leverage soft prompt-tuning to optimize a task-specific soft prompt on\nlimited ground truth data and then prompt the LLMs to tag unlabeled documents\nwith weak queries, yielding enough weak document-query pairs to train\ntask-specific dense retrievers. We design a filter to select high-quality\nexample document-query pairs in the prompt to further improve the quality of\nthe tagged weak queries. To the best of our knowledge, there is no prior work\nutilizing soft prompt tuning to augment DR models. The experiments demonstrate\nthat SPTAR outperforms the unsupervised baselines BM25 and the recently\nproposed LLMs-based augmentation method for DR.\n","authors":["Zhiyuan Peng","Xuyang Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08303v3.pdf","comment":"fix typos"},{"id":"http://arxiv.org/abs/2304.08873v2","updated":"2023-08-29T20:23:41Z","published":"2023-04-18T10:07:09Z","title":"Dual-Granularity Contrastive Learning for Session-based Recommendation","summary":" Session-based recommendation systems (SBRS) are more suitable for the current\ne-commerce and streaming media recommendation scenarios and thus have become a\nhot topic. The data encountered by SBRS is typically highly sparse, which also\nserves as one of the bottlenecks limiting the accuracy of recommendations. So\nContrastive Learning (CL) is applied in SBRS owing to its capability of\nimproving embedding learning under the condition of sparse data. However,\nexisting CL strategies are limited in their ability to enforce finer-grained\n(e.g., factor-level) comparisons and, as a result, are unable to capture subtle\ndifferences between instances. Moreover, these strategies usually use\nitem or segment dropout as a means of data augmentation, which may result in\nsparser data and thus ineffective self-supervised signals. By addressing the\ntwo aforementioned limitations, we introduce a novel multi-granularity CL\nframework. 
Specifically, two extra augmented embedding convolution channels\nwith different granularities are constructed, and the embeddings learned by them\nare compared with those learned from the original view to complete the CL tasks. At\nthe factor level, we employ Disentangled Representation Learning to obtain\nfiner-grained data (e.g., factor-level embeddings), with which we can construct\nfactor-level convolution channels. At the item level, the star graph is deployed as\nthe augmented data, and graph convolution on it can ensure the effectiveness of\nself-supervised signals. The learned embeddings of these two views are compared with\nthose of the basic view to achieve CL at two granularities.\nFinally, the more precise item-level and factor-level embeddings obtained are\nused to generate personalized recommendations for the user. The proposed\nmodel is validated through extensive experiments on two benchmark datasets,\nshowcasing superior performance compared to existing methods.\n","authors":["Zihan Wang","Gang Wu","Haotong Wang"],"pdf_url":"https://arxiv.org/pdf/2304.08873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14935v2","updated":"2023-08-29T19:52:02Z","published":"2022-11-27T21:00:31Z","title":"RecXplainer: Amortized Attribute-based Personalized Explanations for\n Recommender Systems","summary":" Recommender systems influence many of our interactions in the digital world\n-- impacting how we shop for clothes, sorting what we see when browsing YouTube\nor TikTok, and determining which restaurants and hotels we are shown when using\nhospitality platforms. Modern recommender systems are large, opaque models\ntrained on a mixture of proprietary and open-source datasets. Naturally, issues\nof trust arise on both the developer and user side: is the system working\ncorrectly, and why did a user receive (or not receive) a particular\nrecommendation? Providing an explanation alongside a recommendation alleviates\nsome of these concerns. The status quo for auxiliary recommender system\nfeedback is either user-specific explanations (e.g., \"users who bought item B\nalso bought item A\") or item-specific explanations (e.g., \"we are recommending\nitem A because you watched/bought item B\"). However, users bring personalized\ncontext into their search experience, valuing an item as a function of that\nitem's attributes and their own personal preferences. In this work, we propose\nRecXplainer, a novel method for generating fine-grained explanations based on a\nuser's preferences over the attributes of recommended items. We evaluate\nRecXplainer on five real-world and large-scale recommendation datasets using\nfive different kinds of recommender systems to demonstrate the efficacy of\nRecXplainer in capturing users' preferences over item attributes and using them\nto explain recommendations. We also compare RecXplainer to five baselines and\nshow RecXplainer's exceptional performance on ten metrics.\n","authors":["Sahil Verma","Chirag Shah","John P. Dickerson",
"Anurag Beniwal","Narayanan Sadagopan","Arjun Seshadri"],"pdf_url":"https://arxiv.org/pdf/2211.14935v2.pdf","comment":"Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022"},{"id":"http://arxiv.org/abs/2308.15553v1","updated":"2023-08-29T18:19:36Z","published":"2023-08-29T18:19:36Z","title":"Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster\n Analysis","summary":" We introduce the use of a reduction property of the penalty-based formulation of\npseudo-Boolean polynomials as a mechanism for invariant dimensionality\nreduction in cluster analysis processes. In our experiments, we show that\nmultidimensional data, like the 4-dimensional Iris Flower dataset, can be reduced to\n2-dimensional space while the 30-dimensional Wisconsin Diagnostic Breast Cancer\n(WDBC) dataset can be reduced to 3-dimensional space, and by searching for lines or\nplanes that lie between the reduced samples we can extract clusters in a linear and\nunbiased manner with competitive accuracies, reproducibility and clear\ninterpretation.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin"],"pdf_url":"https://arxiv.org/pdf/2308.15553v1.pdf","comment":"14 pages, 4 figures, submitted to the International Conference Data\n Analysis, Optimization and Their Applications on the Occasion of Boris\n Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region,\n Moscow Institute of Physics and Technology\n https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php"},{"id":"http://arxiv.org/abs/2308.15498v1","updated":"2023-08-29T02:42:52Z","published":"2023-08-29T02:42:52Z","title":"Chunked Lists versus Extensible Arrays for Text Inversion","summary":" In our 2017 work on in-memory list-based text inversion [Hawking and\nBillerbeck. Efficient In-Memory, List-Based Text Inversion. ADCS 2017] we\ncompared memory use and indexing speed of a considerable number of variants of\nchunked linked lists. In the present work we compare the best performing of\nthose variants (FBB - dynamic Fibonacci chunking) with the extensible SQ array\ntechnique (SQA) presented in [Moffat and Mackenzie. Immediate-Access Indexing\nUsing Space-Efficient Extensible Arrays. ADCS 2023].\n","authors":["David Hawking","Bodo Billerbeck"],"pdf_url":"https://arxiv.org/pdf/2308.15498v1.pdf","comment":"2 pages, 2 figures, 1 table"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.15479v1","updated":"2023-08-29T17:58:55Z","published":"2023-08-29T17:58:55Z","title":"3D Adversarial Augmentations for Robust Out-of-Domain Predictions","summary":" Since real-world training datasets cannot properly sample the long tail of\nthe underlying data distribution, corner cases and rare out-of-domain samples\ncan severely hinder the performance of state-of-the-art models. This problem\nbecomes even more severe for dense tasks, such as 3D semantic segmentation,\nwhere points of non-standard objects can be confidently associated with the wrong\nclass. In this work, we focus on improving the generalization to out-of-domain\ndata. We achieve this by augmenting the training set with adversarial examples.\nFirst, we learn a set of vectors that deform the objects in an adversarial\nfashion. To prevent the adversarial examples from being too far from the\nexisting data distribution, we preserve their plausibility through a series of\nconstraints, ensuring sensor-awareness and shape smoothness. 
Then, we perform\nadversarial augmentation by applying the learned sample-independent vectors to\nthe available objects when training a model. We conduct extensive experiments\nacross a variety of scenarios on data from KITTI, Waymo, and CrashD for 3D\nobject detection, and on data from SemanticKITTI, Waymo, and nuScenes for 3D\nsemantic segmentation. Despite training on a standard single dataset, our\napproach substantially improves the robustness and generalization of both 3D\nobject detection and 3D semantic segmentation methods to out-of-domain data.\n","authors":["Alexander Lehner","Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.15479v1.pdf","comment":"37 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.15478v1","updated":"2023-08-29T17:57:20Z","published":"2023-08-29T17:57:20Z","title":"An Adaptive Tangent Feature Perspective of Neural Networks","summary":" In order to better understand feature learning in neural networks, we propose\na framework for understanding linear models in tangent feature space where the\nfeatures are allowed to be transformed during training. We consider linear\ntransformations of features, resulting in a joint optimization over parameters\nand transformations with a bilinear interpolation constraint. We show that this\noptimization problem has an equivalent linearly constrained optimization with\nstructured regularization that encourages approximately low rank solutions.\nSpecializing to neural network structure, we gain insights into how the\nfeatures and thus the kernel function change, providing additional nuance to\nthe phenomenon of kernel alignment when the target function is poorly\nrepresented using tangent features. In addition to verifying our theoretical\nobservations in real neural networks on a simple regression problem, we\nempirically show that an adaptive feature implementation of tangent feature\nclassification has an order of magnitude lower sample complexity than the fixed\ntangent feature model on MNIST and CIFAR-10.\n","authors":["Daniel LeJeune","Sina Alemohammad"],"pdf_url":"https://arxiv.org/pdf/2308.15478v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14120v2","updated":"2023-08-29T17:52:02Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this\ngap and perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT ADA without specific guidance. ChatGPT ADA autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. 
We conclude that chatGPT ADA offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15470v1","updated":"2023-08-29T17:50:27Z","published":"2023-08-29T17:50:27Z","title":"Policy composition in reinforcement learning via multi-objective policy\n optimization","summary":" We enable reinforcement learning agents to learn successful behavior policies\nby utilizing relevant pre-existing teacher policies. The teacher policies are\nintroduced as objectives, in addition to the task objective, in a\nmulti-objective policy optimization setting. Using the Multi-Objective Maximum\na Posteriori Policy Optimization algorithm\n\citep{abdolmaleki2020distributional}, we show that teacher policies can help\nspeed up learning, particularly in the absence of shaping rewards. In two\ndomains with continuous observation and action spaces, our agents successfully\ncompose teacher policies in sequence and in parallel, and are also able to\nfurther extend the policies of the teachers in order to solve the task.\n Depending on the specified combination of task and teacher(s), teacher(s) may\nnaturally act to limit the final performance of an agent. The extent to which\nagents are required to adhere to teacher policies is determined by\nhyperparameters that control both the effect of teachers on learning speed\nand the eventual performance of the agent on the task. In the {\tt humanoid}\ndomain \citep{deepmindcontrolsuite2018}, we also equip agents with the ability\nto control the selection of teachers. With this ability, agents are able to\nmeaningfully compose from the teacher policies to achieve a higher task\nreward on the {\tt walk} task than in cases without access to the teacher\npolicies. We show the resemblance of composed task policies to the\ncorresponding teacher policies through videos.\n","authors":["Shruti Mishra","Ankit Anand","Jordan Hoffmann","Nicolas Heess","Martin Riedmiller","Abbas Abdolmaleki","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2308.15470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15466v1","updated":"2023-08-29T17:47:42Z","published":"2023-08-29T17:47:42Z","title":"Input margins can predict generalization too","summary":" Understanding generalization in deep neural networks is an active area of\nresearch. A promising avenue of exploration has been that of margin\nmeasurements: the shortest distance to the decision boundary for a given sample\nor its representation internal to the network. While margins have been shown to\nbe correlated with the generalization ability of a model when measured at its\nhidden representations (hidden margins), no such link between large margins and\ngeneralization has been established for input margins. We show that while input\nmargins are not generally predictive of generalization, they can be if the\nsearch space is appropriately constrained. We develop such a measure based on\ninput margins, which we refer to as `constrained margins'. The predictive power\nof this new measure is demonstrated on the 'Predicting Generalization in Deep\nLearning' (PGDL) dataset and contrasted with hidden representation margins. 
We\nfind that constrained margins achieve highly competitive scores and outperform\nother margin measurements in general. This provides a novel insight into the\nrelationship between generalization and classification margins, and highlights\nthe importance of considering the data manifold for investigations of\ngeneralization in DNNs.\n","authors":["Coenraad Mouton","Marthinus W. Theunissen","Marelie H. Davel"],"pdf_url":"https://arxiv.org/pdf/2308.15466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15464v1","updated":"2023-08-29T17:44:02Z","published":"2023-08-29T17:44:02Z","title":"A Comparative Study of Loss Functions: Traffic Predictions in Regular\n and Congestion Scenarios","summary":" Spatiotemporal graph neural networks have achieved state-of-the-art\nperformance in traffic forecasting. However, they often struggle to forecast\ncongestion accurately due to the limitations of traditional loss functions.\nWhile accurate forecasting of regular traffic conditions is crucial, a reliable\nAI system must also accurately forecast congestion scenarios to maintain safe\nand efficient transportation. In this paper, we explore various loss functions\ninspired by heavy tail analysis and imbalanced classification problems to\naddress this issue. We evaluate the efficacy of these loss functions in\nforecasting traffic speed, with an emphasis on congestion scenarios. Through\nextensive experiments on real-world traffic datasets, we discovered that when\noptimizing for Mean Absolute Error (MAE), the MAE-Focal Loss function stands\nout as the most effective. When optimizing Mean Squared Error (MSE), Gumbel\nLoss proves to be the superior choice. These choices effectively forecast\ntraffic congestion events without compromising the accuracy of regular traffic\nspeed forecasts. This research enhances deep learning models' capabilities in\nforecasting sudden speed changes due to congestion and underscores the need for\nmore research in this direction. By elevating the accuracy of congestion\nforecasting, we advocate for AI systems that are reliable, secure, and\nresilient in practical traffic management scenarios.\n","authors":["Yangxinyu Xie","Tanwi Mallick"],"pdf_url":"https://arxiv.org/pdf/2308.15464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13803v2","updated":"2023-08-29T17:38:45Z","published":"2023-01-31T17:44:59Z","title":"Fairness-aware Vision Transformer via Debiased Self-Attention","summary":" Vision Transformer (ViT) has recently gained significant interest in solving\ncomputer vision (CV) problems due to its capability of extracting informative\nfeatures and modeling long-range dependencies through the self-attention\nmechanism. To fully realize the advantages of ViT in real-world applications,\nrecent works have explored the trustworthiness of ViT, including its robustness\nand explainability. However, another desideratum, fairness, has not yet been\nadequately addressed in the literature. We establish that the existing\nfairness-aware algorithms (primarily designed for CNNs) do not perform well on\nViT. This motivates the development of our novel framework via Debiased\nSelf-Attention (DSA). DSA is a fairness-through-blindness approach that\nforces ViT to eliminate spurious features correlated with the sensitive\nattributes for bias mitigation. Notably, adversarial examples are leveraged to\nlocate and mask the spurious features in the input image patches. 
In addition,\nDSA utilizes an attention-weight alignment regularizer in the training\nobjective to encourage the learning of informative features for target prediction.\nImportantly, our DSA framework leads to improved fairness guarantees over prior\nworks on multiple prediction tasks without compromising target prediction\nperformance.\n","authors":["Yao Qiang","Chengyin Li","Prashant Khanduri","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2301.13803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15461v1","updated":"2023-08-29T17:38:33Z","published":"2023-08-29T17:38:33Z","title":"Canonical Factors for Hybrid Neural Fields","summary":" Factored feature volumes offer a simple way to build more compact, efficient,\nand interpretable neural fields, but also introduce biases that are not\nnecessarily beneficial for real-world data. In this work, we (1) characterize\nthe undesirable biases that these architectures have for axis-aligned signals\n-- they can lead to radiance field reconstruction differences as high as 2\nPSNR -- and (2) explore how learning a set of canonicalizing transformations\ncan improve representations by removing these biases. We prove in a\ntwo-dimensional model problem that simultaneously learning these\ntransformations together with scene appearance succeeds with drastically\nimproved efficiency. We validate the resulting architectures, which we call\nTILTED, using image, signed distance, and radiance field reconstruction tasks,\nwhere we observe improvements across quality, robustness, compactness, and\nruntime. Results demonstrate that TILTED can enable capabilities comparable to\nbaselines that are 2x larger, while highlighting weaknesses of neural field\nevaluation procedures.\n","authors":["Brent Yi","Weijia Zeng","Sam Buchanan","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2308.15461v1.pdf","comment":"ICCV 2023. Project webpage: https://brentyi.github.io/tilted/"},{"id":"http://arxiv.org/abs/2308.15457v1","updated":"2023-08-29T17:31:26Z","published":"2023-08-29T17:31:26Z","title":"From SMOTE to Mixup for Deep Imbalanced Classification","summary":" Given imbalanced data, it is hard to train a good classifier using deep\nlearning because of the poor generalization of minority classes. Traditionally,\nthe well-known synthetic minority oversampling technique (SMOTE) for data\naugmentation, a data mining approach for imbalanced learning, has been used to\nimprove this generalization. However, it is unclear whether SMOTE also benefits\ndeep learning. In this work, we study why the original SMOTE is insufficient\nfor deep learning, and enhance SMOTE using soft labels. Connecting the\nresulting soft SMOTE with Mixup, a modern data augmentation technique, leads to\na unified framework that puts traditional and modern data augmentation\ntechniques under the same umbrella. A careful study within this framework shows\nthat Mixup improves generalization by implicitly achieving uneven margins\nbetween majority and minority classes. We then propose a novel margin-aware\nMixup technique that more explicitly achieves uneven margins. Extensive\nexperimental results demonstrate that our proposed technique yields\nstate-of-the-art performance on deep imbalanced classification while achieving\nsuperior performance on extremely imbalanced data. 
The code is open-sourced in\nour package at https://github.com/ntucllab/imbalanced-DL to foster\nfuture research in this direction.\n","authors":["Wei-Chao Cheng","Tan-Ha Mai","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2308.15457v1.pdf","comment":"25 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.15452v1","updated":"2023-08-29T17:22:39Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" The reasoning capabilities of Large Language Models (LLMs) play a pivotal\nrole in the realm of embodied artificial intelligence. Although there are\neffective methods like program-of-thought prompting for LLMs, which use\nprogramming languages to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose the complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find that not all code data of every complexity level can be learned or understood\nby LLMs. An optimal level of complexity is critical to the improvement of\nreasoning abilities by program-aided prompting. Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrate the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2306.08018v2","updated":"2023-08-29T17:13:05Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a meticulously curated, comprehensive\ninstruction dataset expressly designed for the biomolecular realm.\nMol-Instructions is composed of three pivotal components: molecule-oriented\ninstructions, protein-oriented instructions, and biomolecular text\ninstructions, each curated to enhance the understanding and prediction\ncapabilities of LLMs concerning biomolecular features and behaviors. 
Through\nextensive instruction tuning experiments on the representative LLM, we\nunderscore the potency of Mol-Instructions to enhance the adaptability and\ncognitive acuity of large models within the complex sphere of biomolecular\nstudies, thereby promoting advancements in the biomolecular research community.\nMol-Instructions is made publicly accessible for future research endeavors and\nwill be subjected to continual updates for enhanced applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v2.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions. Add\n quantitative evaluations"},{"id":"http://arxiv.org/abs/2112.09153v2","updated":"2023-08-29T17:04:19Z","published":"2021-12-16T19:00:55Z","title":"An Empirical Investigation of the Role of Pre-training in Lifelong\n Learning","summary":" The lifelong learning paradigm in machine learning is an attractive\nalternative to the more prominent isolated learning scheme not only due to its\nresemblance to biological learning but also its potential to reduce energy\nwaste by obviating excessive model re-training. A key challenge to this\nparadigm is the phenomenon of catastrophic forgetting. With the increasing\npopularity and success of pre-trained models in machine learning, we pose the\nquestion: What role does pre-training play in lifelong learning, specifically\nwith respect to catastrophic forgetting? We investigate existing methods in the\ncontext of large, pre-trained models and evaluate their performance on a\nvariety of text and image classification tasks, including a large-scale study\nusing a novel data set of 15 diverse NLP tasks. Across all settings, we observe\nthat generic pre-training implicitly alleviates the effects of catastrophic\nforgetting when learning multiple tasks sequentially compared to randomly\ninitialized models. We then further investigate why pre-training alleviates\nforgetting in this setting. We study this phenomenon by analyzing the loss\nlandscape, finding that pre-trained weights appear to ease forgetting by\nleading to wider minima. Based on this insight, we propose jointly optimizing\nfor current task loss and loss basin sharpness to explicitly encourage wider\nbasins during sequential fine-tuning. We show that this optimization approach\noutperforms several state-of-the-art task-sequential continual learning\nalgorithms across multiple settings, occasionally even without retaining a\nmemory that scales in size with the number of tasks.\n","authors":["Sanket Vaibhav Mehta","Darshan Patil","Sarath Chandar","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2112.09153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15434v1","updated":"2023-08-29T16:56:03Z","published":"2023-08-29T16:56:03Z","title":"Random feature approximation for general spectral methods","summary":" Random feature approximation is arguably one of the most popular techniques\nto speed up kernel methods in large scale algorithms and provides a theoretical\napproach to the analysis of deep neural networks. We analyze generalization\nproperties for a large class of spectral regularization methods combined with\nrandom features, containing kernel methods with implicit regularization such as\ngradient descent or explicit methods like Tikhonov regularization. 
For our\nestimators, we obtain optimal learning rates over regularity classes (even for\nclasses that are not included in the reproducing kernel Hilbert space), which\nare defined through appropriate source conditions. This improves or completes\nprevious results obtained in related settings for specific kernel algorithms.\n","authors":["Mike Nguyen","Nicole Mücke"],"pdf_url":"https://arxiv.org/pdf/2308.15434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15410v1","updated":"2023-08-29T16:10:20Z","published":"2023-08-29T16:10:20Z","title":"Probabilistic solar flare forecasting using historical magnetogram data","summary":" Solar flare forecasting research using machine learning (ML) has focused on\nhigh resolution magnetogram data from the SDO/HMI era covering Solar Cycle 24\nand the start of Solar Cycle 25, with some efforts looking back to SOHO/MDI for\ndata from Solar Cycle 23. In this paper, we consider over 4 solar cycles of\ndaily historical magnetogram data from multiple instruments. This is the first\nattempt to take advantage of this historical data for ML-based flare\nforecasting. We apply a convolutional neural network (CNN) to extract features\nfrom full-disk magnetograms together with a logistic regression model to\nincorporate scalar features based on magnetograms and flaring history. We use\nan ensemble approach to generate calibrated probabilistic forecasts of M-class\nor larger flares in the next 24 hours. Overall, we find that including\nhistorical data improves forecasting skill and reliability. We show that single\nframe magnetograms do not contain significantly more relevant information than\ncan be summarized in a small number of scalar features, and that flaring\nhistory has greater predictive power than our CNN-extracted features. This\nindicates the importance of including temporal information in flare forecasting\nmodels.\n","authors":["Kiera van der Sande","Andrés Muñoz-Jaramillo","Subhamoy Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2308.15410v1.pdf","comment":"22 pages, 16 figures, accepted to ApJ"},{"id":"http://arxiv.org/abs/2308.15405v1","updated":"2023-08-29T16:07:18Z","published":"2023-08-29T16:07:18Z","title":"Robust Long-Tailed Learning via Label-Aware Bounded CVaR","summary":" Data in real-world classification problems are always imbalanced or\nlong-tailed, wherein the majority classes have most of the samples, which\ndominate the model training. In such a setting, the naive model tends to have\npoor performance on the minority classes. Previously, a variety of loss\nmodifications have been proposed to address the long-tailed learning problem,\nwhile these methods either treat the samples in the same class\nindiscriminately or lack a theoretical guarantee. In this paper, we propose\ntwo novel approaches based on CVaR (Conditional Value at Risk) to improve the\nperformance of long-tailed learning with a solid theoretical ground.\nSpecifically, we first introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss\nto overcome the pessimistic result of the original CVaR, and further design the\noptimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we\nadditionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to\nstabilize the optimization process, where we also provide theoretical\nsupport. 
Extensive experiments on real-world datasets with long-tailed label\ndistributions verify the superiority of our proposed methods.\n","authors":["Hong Zhu","Runpeng Yu","Xing Tang","Yifei Wang","Yuan Fang","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12896v2","updated":"2023-08-29T15:57:02Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v2.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.15395v1","updated":"2023-08-29T15:54:15Z","published":"2023-08-29T15:54:15Z","title":"The CausalBench challenge: A machine learning contest for gene network\n inference from single-cell perturbation data","summary":" In drug discovery, mapping interactions between genes within cellular systems\nis a crucial early step. This helps formulate hypotheses regarding molecular\nmechanisms that could potentially be targeted by future medicines. The\nCausalBench Challenge was an initiative to invite the machine learning\ncommunity to advance the state of the art in constructing gene-gene interaction\nnetworks. These networks, derived from large-scale, real-world datasets of\nsingle cells under various perturbations, are crucial for understanding the\ncausal mechanisms underlying disease biology. Using the framework provided by\nthe CausalBench benchmark, participants were tasked with enhancing the capacity\nof the state of the art methods to leverage large-scale genetic perturbation\ndata. This report provides an analysis and summary of the methods submitted\nduring the challenge to give a partial picture of the state of the art at the\ntime of the challenge. 
The winning solutions significantly improved performance\ncompared to previous baselines, establishing a new state of the art for this\ncritical task in biology and medicine.\n","authors":["Mathieu Chevalley","Jacob Sackett-Sanders","Yusuf Roohani","Pascal Notin","Artemy Bakulin","Dariusz Brzezinski","Kaiwen Deng","Yuanfang Guan","Justin Hong","Michael Ibrahim","Wojciech Kotlowski","Marcin Kowiel","Panagiotis Misiakos","Achille Nazaret","Markus Püschel","Chris Wendler","Arash Mehrjou","Patrick Schwab"],"pdf_url":"https://arxiv.org/pdf/2308.15395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03854v4","updated":"2023-08-29T15:51:05Z","published":"2023-07-07T22:00:31Z","title":"inTformer: A Time-Embedded Attention-Based Transformer for Crash\n Likelihood Prediction at Intersections Using Connected Vehicle Data","summary":" The real-time crash likelihood prediction model is an essential component of\nthe proactive traffic safety management system. Over the years, numerous\nstudies have attempted to construct a crash likelihood prediction model in\norder to enhance traffic safety, but mostly on freeways. In the majority of the\nexisting studies, researchers have primarily employed a deep learning-based\nframework to identify crash potential. Lately, Transformer has emerged as a\npotential deep neural network that fundamentally operates through\nattention-based mechanisms. Transformer has several functional benefits over\nextant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can\nreadily handle long-term dependencies in a data sequence. Secondly,\nTransformers can process all elements in a data sequence in parallel during\ntraining. Finally, a Transformer does not have the vanishing gradient issue.\nRealizing the immense potential of Transformers, this paper proposes\ninTersection-Transformer (inTformer), a time-embedded attention-based\nTransformer model that can effectively predict intersection crash likelihood in\nreal-time. The proposed model was evaluated using connected vehicle data\nextracted from the Signal Analytics Platform. Acknowledging the complex traffic\noperation mechanisms at intersections, this study developed zone-specific models\nby dividing the intersection region into two distinct zones:\nthe within-intersection zone and the approach zone. The best inTformer models in the\n'within-intersection' and 'approach' zones achieved sensitivities of 73% and\n70%, respectively. The zone-level models were also compared to earlier studies\non crash likelihood prediction at intersections and with several established\ndeep learning models trained on the same connected vehicle dataset.\n","authors":["B M Tazbiul Hassan Anik","Zubayer Islam","Mohamed Abdel-Aty"],"pdf_url":"https://arxiv.org/pdf/2307.03854v4.pdf","comment":"29 pages, 10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2308.15394v1","updated":"2023-08-29T15:48:49Z","published":"2023-08-29T15:48:49Z","title":"Decentralized Multi-agent Reinforcement Learning based State-of-Charge\n Balancing Strategy for Distributed Energy Storage System","summary":" This paper develops a Decentralized Multi-Agent Reinforcement Learning\n(Dec-MARL) method to solve the SoC balancing problem in the distributed energy\nstorage system (DESS). First, the SoC balancing problem is formulated into a\nfinite Markov decision process with action constraints derived from demand\nbalance, which can be solved by Dec-MARL. 
Specifically, the first-order average\nconsensus algorithm is utilized to expand the observations of the DESS state in\na fully-decentralized way, and the initial actions (i.e., output power) are\ndecided by the agents (i.e., energy storage units) according to these\nobservations. In order to get the final actions in the allowable range, a\ncounterfactual demand balance algorithm is proposed to balance the total demand\nand the initial actions. Next, the agents execute the final actions and get\nlocal rewards from the environment, and the DESS steps into the next state.\nFinally, through the first-order average consensus algorithm, the agents get\nthe average reward and the expanded observation of the next state for later\ntraining. By the above procedure, Dec-MARL reveals outstanding performance in a\nfully-decentralized system without any expert experience or the construction of any\ncomplicated model. Moreover, it is flexible and can be extended to other\ndecentralized multi-agent systems straightforwardly. Extensive simulations have\nvalidated the effectiveness and efficiency of Dec-MARL.\n","authors":["Zheng Xiong","Biao Luo","Bing-Chuan Wang","Xiaodong Xu","Xiaodong Liu","Tingwen Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01351v3","updated":"2023-08-29T15:42:35Z","published":"2022-12-02T18:13:48Z","title":"A Bayesian Framework for Digital Twin-Based Control, Monitoring, and\n Data Collection in Wireless Systems","summary":" Commonly adopted in the manufacturing and aerospace sectors, digital twin\n(DT) platforms are increasingly seen as a promising paradigm to control,\nmonitor, and analyze software-based, \"open\", communication systems. Notably, DT\nplatforms provide a sandbox in which to test artificial intelligence (AI)\nsolutions for communication systems, potentially reducing the need to collect\ndata and test algorithms in the field, i.e., on the physical twin (PT). A key\nchallenge in the deployment of DT systems is to ensure that virtual control\noptimization, monitoring, and analysis at the DT are safe and reliable,\navoiding incorrect decisions caused by \"model exploitation\". To address this\nchallenge, this paper presents a general Bayesian framework with the aim of\nquantifying and accounting for model uncertainty at the DT that is caused by\nlimitations in the amount and quality of data available at the DT from the PT.\nIn the proposed framework, the DT builds a Bayesian model of the communication\nsystem, which is leveraged to enable core DT functionalities such as control\nvia multi-agent reinforcement learning (MARL), monitoring of the PT for anomaly\ndetection, prediction, data-collection optimization, and counterfactual\nanalysis. To exemplify the application of the proposed framework, we\nspecifically investigate a case-study system encompassing multiple sensing\ndevices that report to a common receiver. 
Experimental results validate the\neffectiveness of the proposed Bayesian framework as compared to standard\nfrequentist model-based solutions.\n","authors":["Clement Ruah","Osvaldo Simeone","Bashir Al-Hashimi"],"pdf_url":"https://arxiv.org/pdf/2212.01351v3.pdf","comment":"Accepted for publication in IEEE Journal on Selected Areas in\n Communications ; Extends and subsumes arXiv:2210.05582 ; Updates: -\n 18/01/2023: Updated reference ; - 29/08/2023: Revised manuscript version"},{"id":"http://arxiv.org/abs/2306.10033v2","updated":"2023-08-29T15:39:32Z","published":"2023-06-07T19:40:37Z","title":"Investigating Reproducibility at Interspeech Conferences: A Longitudinal\n and Comparative Perspective","summary":" Reproducibility is a key aspect of scientific advancement across\ndisciplines, and reducing barriers for open science is a focus area for the\ntheme of Interspeech 2023. Availability of source code is one of the indicators\nthat facilitate reproducibility. However, less is known about the rates of\nreproducibility at Interspeech conferences in comparison to other conferences\nin the field. In order to fill this gap, we have surveyed 27,717 papers at\nseven conferences across speech and language processing disciplines. We find\nthat despite having a number of accepted papers close to that of the other conferences,\nInterspeech has up to 40% lower source code availability. In addition to\nreporting the difficulties we have encountered during our research, we also\nprovide recommendations and possible directions to increase reproducibility for\nfurther studies.\n","authors":["Mohammad Arvan","A. Seza Doğruöz","Natalie Parde"],"pdf_url":"https://arxiv.org/pdf/2306.10033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15386v1","updated":"2023-08-29T15:29:06Z","published":"2023-08-29T15:29:06Z","title":"Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation\n and Diagnosis","summary":" Thyroid nodule segmentation is a crucial step in the diagnostic procedure of\nphysicians and computer-aided diagnosis systems. Most current studies treat\nsegmentation and diagnosis as independent tasks without considering the\ncorrelation between these tasks. The sequential steps of these independent tasks\nin computer-aided diagnosis systems may lead to the accumulation of errors.\nTherefore, it is worth combining them as a whole by exploring the\nrelationship between thyroid nodule segmentation and diagnosis. According to\nthe thyroid imaging reporting and data system (TI-RADS), the assessment of\nshape and margin characteristics is the prerequisite for the discrimination of\nbenign and malignant thyroid nodules. These characteristics can be observed in\nthe thyroid nodule segmentation masks. Inspired by the diagnostic procedure of\nTI-RADS, this paper proposes a shape-margin knowledge augmented network\n(SkaNet) for simultaneous thyroid nodule segmentation and diagnosis. Due to\nthe similarity in visual features between segmentation and diagnosis, SkaNet\nshares visual features in the feature extraction stage and then utilizes a\ndual-branch architecture to perform thyroid nodule segmentation and diagnosis\ntasks simultaneously. To enhance effective discriminative features, an\nexponential mixture module is devised, which incorporates convolutional feature\nmaps and self-attention maps by exponential weighting. Then, SkaNet is jointly\noptimized by a knowledge augmented multi-task loss function with a constraint\npenalty term. 
It embeds shape and margin characteristics through numerical\ncomputation and models the relationship between the thyroid nodule diagnosis\nresults and segmentation masks.\n","authors":["Weihua Liu","Chaochao Lin"],"pdf_url":"https://arxiv.org/pdf/2308.15386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15370v1","updated":"2023-08-29T15:06:47Z","published":"2023-08-29T15:06:47Z","title":"Multi-Response Heteroscedastic Gaussian Process Models and Their\n Inference","summary":" Despite the widespread utilization of Gaussian process models for versatile\nnonparametric modeling, they exhibit limitations in effectively capturing\nabrupt changes in function smoothness and accommodating relationships with\nheteroscedastic errors. Addressing these shortcomings, the heteroscedastic\nGaussian process (HeGP) regression seeks to introduce flexibility by\nacknowledging the variability of residual variances across covariates in the\nregression model. In this work, we extend the HeGP concept, expanding its scope\nbeyond regression tasks to encompass classification and state-space models. To\nachieve this, we propose a novel framework where the Gaussian process is\ncoupled with a covariate-induced precision matrix process, adopting a mixture\nformulation. This approach enables the modeling of heteroscedastic covariance\nfunctions across covariates. To mitigate the computational challenges posed by\nsampling, we employ variational inference to approximate the posterior and\nfacilitate posterior predictive modeling. Additionally, our training process\nleverages an EM algorithm featuring closed-form M-step updates to efficiently\nevaluate the heteroscedastic covariance function. A notable feature of our\nmodel is its consistent performance on multivariate responses, accommodating\nvarious types (continuous or categorical) seamlessly. Through a combination of\nsimulations and real-world applications in climatology, we illustrate the\nmodel's prowess and advantages. By overcoming the limitations of traditional\nGaussian process models, our proposed framework offers a robust and versatile\ntool for a wide array of applications.\n","authors":["Taehee Lee","Jun S. Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15370v1.pdf","comment":"submitted to the Journal of the American Statistical Association\n (JASA)"},{"id":"http://arxiv.org/abs/2308.15367v1","updated":"2023-08-29T15:03:05Z","published":"2023-08-29T15:03:05Z","title":"Efficient Model Personalization in Federated Learning via\n Client-Specific Prompt Generation","summary":" Federated learning (FL) emerges as a decentralized learning framework which\ntrains models from multiple distributed clients without sharing their data to\npreserve privacy. Recently, large-scale pre-trained models (e.g., Vision\nTransformer) have shown a strong capability of deriving robust representations.\nHowever, the data heterogeneity among clients, the limited computation\nresources, and the communication bandwidth restrict the deployment of\nlarge-scale models in FL frameworks. To leverage robust representations from\nlarge-scale models while enabling efficient model personalization for\nheterogeneous clients, we propose a novel personalized FL framework of\nclient-specific Prompt Generation (pFedPG), which learns to deploy a\npersonalized prompt generator at the server for producing client-specific\nvisual prompts that efficiently adapts frozen backbones to local data\ndistributions. 
Our proposed framework jointly optimizes the stages of\npersonalized prompt adaptation locally and personalized prompt generation\nglobally. The former aims to train visual prompts that adapt foundation models\nto each client, while the latter observes local optimization directions to\ngenerate personalized prompts for all clients. Through extensive experiments on\nbenchmark datasets, we show that our pFedPG performs favorably against\nstate-of-the-art personalized FL methods under various types of data\nheterogeneity, allowing computation- and communication-efficient model\npersonalization.\n","authors":["Fu-En Yang","Chien-Yi Wang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15367v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15364v1","updated":"2023-08-29T15:01:01Z","published":"2023-08-29T15:01:01Z","title":"Heterogeneous Multi-Task Gaussian Cox Processes","summary":" This paper presents a novel extension of multi-task Gaussian Cox processes\nfor modeling multiple heterogeneous correlated tasks jointly, e.g.,\nclassification and regression, via multi-output Gaussian processes (MOGP). A\nMOGP prior over the parameters of the dedicated likelihoods for classification,\nregression and point process tasks can facilitate sharing of information\nbetween heterogeneous tasks, while allowing for nonparametric parameter\nestimation. To circumvent the non-conjugate Bayesian inference in the MOGP\nmodulated heterogeneous multi-task framework, we employ the data augmentation\ntechnique and derive a mean-field approximation to realize closed-form\niterative updates for estimating model parameters. We demonstrate the\nperformance and inference on both 1D synthetic data and 2D urban data from\nVancouver.\n","authors":["Feng Zhou","Quyu Kong","Zhijie Deng","Fengxiang He","Peng Cui","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.15364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15363v1","updated":"2023-08-29T14:59:54Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for the Text-to-SQL\ntask. However, the absence of a systematic benchmark inhibits the design\nof effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematic and\nextensive comparison of existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborate on their pros and cons. Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar.\nTowards an efficient and economic LLM-based Text-to-SQL solution, we emphasize\ntoken efficiency in prompt engineering and compare the prior studies under\nthis metric. Additionally, we investigate open-source LLMs in in-context\nlearning, and further enhance their performance with task-specific supervised\nfine-tuning. Our explorations highlight open-source LLMs' potential in\nText-to-SQL, as well as the advantages and disadvantages of the task-specific\nsupervised fine-tuning. 
We hope that our work provides a deeper understanding\nof Text-to-SQL with LLMs, and inspires further investigations and broad\napplications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v1.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2305.14594v2","updated":"2023-08-29T14:51:08Z","published":"2023-05-24T00:20:59Z","title":"torchgfn: A PyTorch GFlowNet library","summary":" The growing popularity of generative flow networks (GFlowNets or GFNs) among a\nrange of researchers with diverse backgrounds and areas of expertise\nnecessitates a library which facilitates the testing of new features, such as\ntraining losses, that can be easily compared to standard benchmark\nimplementations on a set of common environments. torchgfn is a PyTorch\nlibrary that aims to address this need. It provides users with a simple API for\nenvironments and useful abstractions for samplers and losses. Multiple examples\nare provided, replicating and unifying published results. The code is available\nat https://github.com/saleml/torchgfn.\n","authors":["Salem Lahlou","Joseph D. Viviano","Victor Schmidt","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2305.14594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15349v1","updated":"2023-08-29T14:45:23Z","published":"2023-08-29T14:45:23Z","title":"Lie-Poisson Neural Networks (LPNets): Data-Based Computing of\n Hamiltonian Systems with Symmetries","summary":" An accurate data-based prediction of the long-term evolution of Hamiltonian\nsystems requires a network that preserves the appropriate structure under each\ntime step. Every Hamiltonian system contains two essential ingredients: the\nPoisson bracket and the Hamiltonian. Hamiltonian systems with symmetries, whose\nparadigm examples are the Lie-Poisson systems, have been shown to describe a\nbroad category of physical phenomena, from satellite motion to underwater\nvehicles, fluids, geophysical applications, complex fluids, and plasma physics.\nThe Poisson bracket in these systems comes from the symmetries, while the\nHamiltonian comes from the underlying physics. We view the symmetry of the\nsystem as primary, hence the Lie-Poisson bracket is known exactly, whereas the\nHamiltonian is regarded as coming from physics and is considered not known, or\nknown approximately. Using this approach, we develop a network based on\ntransformations that exactly preserve the Poisson bracket and the special\nfunctions of the Lie-Poisson systems (Casimirs) to machine precision. We\npresent two flavors of such systems: one, where the parameters of\ntransformations are computed from data using a dense neural network (LPNets),\nand another, where the composition of transformations is used as building\nblocks (G-LPNets). We also show how to adapt these methods to a larger class of\nPoisson brackets. We apply the resulting methods to several examples, such as\nrigid body (satellite) motion, underwater vehicles, a particle in a magnetic\nfield, and others. 
The methods developed in this paper are important for the\nconstruction of accurate data-based methods for simulating the long-term\ndynamics of physical systems.\n","authors":["Christopher Eldred","François Gay-Balmaz","Sofiia Huraka","Vakhtang Putkaradze"],"pdf_url":"https://arxiv.org/pdf/2308.15349v1.pdf","comment":"57 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.15344v1","updated":"2023-08-29T14:41:05Z","published":"2023-08-29T14:41:05Z","title":"Imperceptible Adversarial Attack on Deep Neural Networks from Image\n Boundary","summary":" Although Deep Neural Networks (DNNs), such as convolutional neural\nnetworks (CNNs) and Vision Transformers (ViTs), have been successfully applied\nin the field of computer vision, they are demonstrated to be vulnerable to\nwell-sought Adversarial Examples (AEs) that can easily fool the DNNs. The\nresearch in AEs has been active, and many adversarial attacks and explanations\nhave been proposed since they were discovered in 2014. The mystery of the AE's\nexistence is still an open question, and many studies suggest that DNN training\nalgorithms have blind spots. Salient objects usually do not overlap with\nboundaries; hence, the boundaries do not attract the DNN model's attention.\nNevertheless, recent studies show that the boundaries can dominate the behavior\nof the DNN models. Hence, this study aims to look at AEs from a different\nperspective and proposes an imperceptible adversarial attack that systematically\nattacks the input image boundary to find the AEs. The experimental results\nhave shown that the proposed boundary attacking method effectively attacks six\nCNN models and the ViT using only 32% of the input image content (from the\nboundaries) with an average success rate (SR) of 95.2% and an average peak\nsignal-to-noise ratio of 41.37 dB. Correlation analyses are conducted,\nincluding the relation between the adversarial boundary's width and the SR and\nhow the adversarial boundary changes the DNN model's attention. This paper's\ndiscoveries can potentially advance the understanding of AEs and provide a\ndifferent perspective on how AEs can be constructed.\n","authors":["Fahad Alrasheedi","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.15344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.10184v3","updated":"2023-08-29T14:36:29Z","published":"2022-10-18T22:12:29Z","title":"Application Performance Modeling via Tensor Completion","summary":" Performance tuning, software/hardware co-design, and job scheduling are among\nthe many tasks that rely on models to predict application performance. We\npropose and evaluate low-rank tensor decomposition for modeling application\nperformance. We discretize the input and configuration domains of an\napplication using regular grids. Application execution times mapped within\ngrid-cells are averaged and represented by tensor elements. We show that\nlow-rank canonical-polyadic (CP) tensor decomposition is effective in\napproximating these tensors. We further show that this decomposition enables\naccurate extrapolation of unobserved regions of an application's parameter\nspace. We then employ tensor completion to optimize a CP decomposition given a\nsparse set of observed execution times. 
We consider alternative\npiecewise/grid-based models and supervised learning models for six applications\nand demonstrate that CP decomposition optimized using tensor completion offers\nhigher prediction accuracy and memory-efficiency for high-dimensional\nperformance modeling.\n","authors":["Edward Hutter","Edgar Solomonik"],"pdf_url":"https://arxiv.org/pdf/2210.10184v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06399v2","updated":"2023-08-29T14:27:58Z","published":"2023-08-11T21:46:45Z","title":"Learning Bayesian Networks with Heterogeneous Agronomic Data Sets via\n Mixed-Effect Models and Hierarchical Clustering","summary":" Research involving diverse but related data sets, where associations between\ncovariates and outcomes may vary, is prevalent in various fields including\nagronomic studies. In these scenarios, hierarchical models, also known as\nmultilevel models, are frequently employed to assimilate information from\ndifferent data sets while accommodating their distinct characteristics.\nHowever, their structure extends beyond simple heterogeneity, as variables often\nform complex networks of causal relationships.\n Bayesian networks (BNs) provide a powerful framework for modelling such\nrelationships using directed acyclic graphs to illustrate the connections\nbetween variables. This study introduces a novel approach that integrates\nrandom effects into BN learning. Rooted in linear mixed-effects models, this\napproach is particularly well-suited for handling hierarchical data. Results\nfrom a real-world agronomic trial suggest that employing this approach enhances\nstructural learning, leading to the discovery of new connections and\nimproved model specification. Furthermore, we observe a\nreduction in prediction errors from 28% to 17%. By extending the applicability\nof BNs to complex data set structures, this approach contributes to the\neffective utilisation of BNs for hierarchical agronomic data. This, in turn,\nenhances their value as decision-support tools in the field.\n","authors":["Lorenzo Valleggi","Marco Scutari","Federico Mattia Stefanini"],"pdf_url":"https://arxiv.org/pdf/2308.06399v2.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.15327v1","updated":"2023-08-29T14:23:44Z","published":"2023-08-29T14:23:44Z","title":"Enhancing Robot Learning through Learned Human-Attention Feature Maps","summary":" Robust and efficient learning remains a challenging problem in robotics, in\nparticular with complex visual inputs. Inspired by the human attention mechanism,\nwith which we quickly process complex visual scenes and react to changes in the\nenvironment, we think that embedding auxiliary information about focus points\ninto robot learning would enhance efficiency and robustness of the learning\nprocess. In this paper, we propose a novel approach to model and emulate\nhuman attention with an approximate prediction model. We then leverage this\noutput and feed it as a structured auxiliary feature map into downstream\nlearning tasks. We validate this idea by learning a prediction model from\nhuman-gaze recordings of manual driving in the real world. We test our approach\non two learning tasks - object detection and imitation learning. Our\nexperiments demonstrate that the inclusion of predicted human attention leads\nto improved robustness of the trained models to out-of-distribution samples and\nfaster learning in low-data regime settings. 
Our work highlights the potential\nof incorporating structured auxiliary information in representation learning\nfor robotics and opens up new avenues for research in this direction. All code\nand data are available online.\n","authors":["Daniel Scheuchenstuhl","Stefan Ulmer","Felix Resch","Luigi Berducci","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2308.15327v1.pdf","comment":"This work has been accepted for the RAP4Robots workshop at ICRA 2023\n in London"},{"id":"http://arxiv.org/abs/2308.15323v1","updated":"2023-08-29T14:20:13Z","published":"2023-08-29T14:20:13Z","title":"Occlusion-Aware Deep Convolutional Neural Network via Homogeneous\n Tanh-transforms for Face Parsing","summary":" Face parsing infers a pixel-wise label map for each semantic facial\ncomponent. Previous methods generally work well for uncovered faces, but they\noverlook facial occlusion and ignore some contextual area outside a single\nface, especially as facial occlusion has become a common situation during the\nCOVID-19 epidemic. Inspired by the illumination theory of images, we propose a\nnovel homogeneous tanh-transform for image preprocessing, which is made up of\nfour tanh-transforms that fuse the central vision and the peripheral vision\ntogether. Our proposed method addresses the dilemma of face parsing under\nocclusion and compresses more information of the surrounding context. Based on\nhomogeneous tanh-transforms, we propose an occlusion-aware convolutional neural\nnetwork for occluded face parsing. It combines information in both\nTanh-polar space and Tanh-Cartesian space, enhancing receptive\nfields. Furthermore, we introduce an occlusion-aware loss to focus on the\nboundaries of occluded regions. The network is simple and flexible, and can be\ntrained end-to-end. To facilitate future research on occluded face parsing, we\nalso contribute a new cleaned face parsing dataset, which is manually purified\nfrom several academic and industrial datasets, including CelebAMask-HQ,\nShort-video Face Parsing, and the Helen dataset, and will make it public.\nExperiments demonstrate that our method surpasses state-of-the-art methods for face\nparsing under occlusion.\n","authors":["Weihua Liu","Chaochao Lin","Haoping Yu","Said Boumaraf","Zhaoqiong Pi"],"pdf_url":"https://arxiv.org/pdf/2308.15323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v1","updated":"2023-08-29T14:16:09Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then identify the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. 
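A hedged sketch of where Epsilon Scaling could sit inside a generic DDIM-style sampling step: the network output (epsilon) is divided by a scalar lambda_t > 1 before being used. The value of lambda_t and the surrounding sampler are assumptions for illustration, not the authors' exact recipe.

import torch

def ddim_step(model, x_t, t, alpha_bar_t, alpha_bar_prev, lambda_t=1.005):
    eps = model(x_t, t) / lambda_t  # Epsilon Scaling: shrink the prediction
    # standard deterministic DDIM update built from the scaled epsilon
    x0 = (x_t - (1 - alpha_bar_t).sqrt() * eps) / alpha_bar_t.sqrt()
    return alpha_bar_prev.sqrt() * x0 + (1 - alpha_bar_prev).sqrt() * eps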
Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v1.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2303.07122v4","updated":"2023-08-29T14:07:12Z","published":"2023-02-22T19:35:28Z","title":"Quantifying Causes of Arctic Amplification via Deep Learning based\n Time-series Causal Inference","summary":" The warming of the Arctic, also known as Arctic amplification, is led by\nseveral atmospheric and oceanic drivers. However, the details of its underlying\nthermodynamic causes are still unknown. Inferring the causal effects of\natmospheric processes on sea ice melt using fixed treatment effect strategies\nleads to unrealistic counterfactual estimations. Such models are also prone to\nbias due to time-varying confoundedness. Further, the complex non-linearity in\nEarth science data makes it infeasible to perform causal inference using\nexisting marginal structural techniques. In order to tackle these challenges,\nwe propose TCINet, a time-series causal inference model that infers causation under\ncontinuous treatment using recurrent neural networks and a novel probabilistic\nbalancing technique. Through experiments on synthetic and observational data,\nwe show how our research can substantially improve the ability to quantify\nleading causes of Arctic sea ice melt, further paving the way for causal\ninference in observational Earth science.\n","authors":["Sahara Ali","Omar Faruque","Yiyi Huang","Md. Osman Gani","Aneesh Subramanian","Nicole-Jienne Shchlegel","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2303.07122v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.01654v3","updated":"2023-08-29T13:58:43Z","published":"2020-10-04T18:56:34Z","title":"Bayesian Feature Selection in Joint Quantile Time Series Analysis","summary":" Quantile feature selection over correlated multivariate time series data has\nalways been a methodological challenge and is an open problem. In this paper,\nwe propose a general Bayesian dimension reduction methodology for feature\nselection in high-dimensional joint quantile time series analysis, under the\nname of the quantile feature selection time series (QFSTS) model. The QFSTS\nmodel is a general structural time series model, where each component yields an\nadditive contribution to the time series modeling with direct interpretations.\nThe model is flexible in the sense that users can add or remove components\nfor each time series, and each time series can have its own specific\ncomponents of different sizes. Feature selection is conducted in the quantile\nregression component, where each time series has its own pool of\ncontemporaneous external predictors allowing nowcasting. Bayesian methodology\nin extending feature selection to the quantile time series research area is\ndeveloped using the multivariate asymmetric Laplace distribution, a spike-and-slab\nprior setup, the Metropolis-Hastings algorithm, and the Bayesian model\naveraging technique, all implemented consistently in the Bayesian paradigm. The\nQFSTS model requires only small datasets to train and converges fast. 
Extensive\nexaminations confirmed that the QFSTS model has superior performance in feature\nselection, parameter estimation, and forecasting.\n","authors":["Ning Ning"],"pdf_url":"https://arxiv.org/pdf/2010.01654v3.pdf","comment":"Accepted to the Bayesian Analysis journal"},{"id":"http://arxiv.org/abs/2304.03981v3","updated":"2023-08-29T13:50:43Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nof recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with a thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check, in datasets of non-target-category retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15308v1","updated":"2023-08-29T13:48:35Z","published":"2023-08-29T13:48:35Z","title":"On-Device Learning with Binary Neural Networks","summary":" Existing Continual Learning (CL) solutions only partially address the\nconstraints on power, memory and computation of the deep learning models when\ndeployed on low-power embedded CPUs. In this paper, we propose a CL solution\nthat embraces the recent advancements in the CL field and the efficiency of\nBinary Neural Networks (BNNs), which use 1-bit for weights and activations to\nefficiently execute deep learning models. We propose a hybrid quantization of\nCWR* (an effective CL approach) that treats the forward and backward\npasses differently in order to retain more precision during the gradient update\nstep while minimizing the latency overhead. The choice of a binary network as\nbackbone is essential to meet the constraints of low-power devices and, to the\nbest of the authors' knowledge, this is the first attempt to demonstrate on-device\nlearning with BNNs. 
The experimental validation carried out confirms the\nvalidity and suitability of the proposed method.\n","authors":["Lorenzo Vorabbi","Davide Maltoni","Stefano Santi"],"pdf_url":"https://arxiv.org/pdf/2308.15308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.15584v2","updated":"2023-08-29T13:35:42Z","published":"2020-12-31T12:40:52Z","title":"Combinatorial Pure Exploration with Full-bandit Feedback and Beyond:\n Solving Combinatorial Optimization under Uncertainty with Limited Observation","summary":" Combinatorial optimization is one of the fundamental research fields that has\nbeen extensively studied in theoretical computer science and operations\nresearch. When developing an algorithm for combinatorial optimization, it is\ncommonly assumed that parameters such as edge weights are exactly known as\ninputs. However, this assumption may not be fulfilled since input parameters\nare often uncertain or initially unknown in many applications such as\nrecommender systems, crowdsourcing, communication networks, and online\nadvertisement. To resolve such uncertainty, the problem of combinatorial pure\nexploration of multi-armed bandits (CPE) and its variants have received\nincreasing attention. Earlier work on CPE has studied the semi-bandit feedback\nor assumed that the outcome from each individual edge is always accessible at\nall rounds. However, due to practical constraints such as a budget ceiling or\nprivacy concerns, such strong feedback is not always available in recent\napplications. In this article, we review recently proposed techniques for\ncombinatorial pure exploration problems with limited feedback.\n","authors":["Yuko Kuroki","Junya Honda","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2012.15584v2.pdf","comment":"Preprint of an Invited Review Article, In Fields Institute"},{"id":"http://arxiv.org/abs/2306.11167v3","updated":"2023-08-29T13:33:52Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if any, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impedes human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study the fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. 
In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v3.pdf","comment":"V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption\n overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet\n results in Section 4.3 (added). 22 pages with Appendix"},{"id":"http://arxiv.org/abs/2209.04512v3","updated":"2023-08-29T13:33:49Z","published":"2022-09-09T20:29:54Z","title":"Deep Learning Based Residuals in Non-linear Factor Models: Precision\n Matrix Estimation of Returns with Low Signal-to-Noise Ratio","summary":" This paper introduces a consistent estimator and rate of convergence for the\nprecision matrix of asset returns in large portfolios using a non-linear factor\nmodel within the deep learning framework. Our estimator remains valid even in\nlow signal-to-noise ratio environments typical for financial markets and is\ncompatible with weak factors. Our theoretical analysis establishes uniform\nbounds on expected estimation risk based on deep neural networks for an\nexpanding number of assets. Additionally, we provide a new consistent\ndata-dependent estimator of error covariance in deep neural networks. Our\nmodels demonstrate superior accuracy in extensive simulations and the empirics.\n","authors":["Mehmet Caner","Maurizio Daniele"],"pdf_url":"https://arxiv.org/pdf/2209.04512v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15291v1","updated":"2023-08-29T13:25:26Z","published":"2023-08-29T13:25:26Z","title":"Towards quantitative precision for ECG analysis: Leveraging state space\n models, self-supervision and patient metadata","summary":" Deep learning has emerged as the preferred modeling approach for automatic\nECG analysis. In this study, we investigate three elements aimed at improving\nthe quantitative accuracy of such systems. These components consistently\nenhance performance beyond the existing state-of-the-art, which is\npredominantly based on convolutional models. Firstly, we explore more\nexpressive architectures by exploiting structured state space models (SSMs).\nThese models have shown promise in capturing long-term dependencies in time\nseries data. By incorporating SSMs into our approach, we not only achieve\nbetter performance, but also gain insights into long-standing questions in the\nfield. Specifically, for standard diagnostic tasks, we find no advantage in\nusing higher sampling rates such as 500Hz compared to 100Hz. Similarly,\nextending the input size of the model beyond 3 seconds does not lead to\nsignificant improvements. Secondly, we demonstrate that self-supervised\nlearning using contrastive predictive coding can further improve the\nperformance of SSMs. By leveraging self-supervision, we enable the model to\nlearn more robust and representative features, leading to improved analysis\naccuracy. 
Lastly, we depart from synthetic benchmarking scenarios and\nincorporate basic demographic metadata alongside the ECG signal as input. This\ninclusion of patient metadata departs from the conventional practice of relying\nsolely on the signal itself. Remarkably, this addition consistently yields\npositive effects on predictive performance. We firmly believe that all three\ncomponents should be considered when developing next-generation ECG analysis\nalgorithms.\n","authors":["Temesgen Mehari","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2308.15291v1.pdf","comment":"extended version of arXiv:2211.07579"},{"id":"http://arxiv.org/abs/2308.15283v1","updated":"2023-08-29T13:14:53Z","published":"2023-08-29T13:14:53Z","title":"Structural Node Embeddings with Homomorphism Counts","summary":" Graph homomorphism counts, first explored by Lov\\'asz in 1967, have recently\ngarnered interest as a powerful tool in graph-based machine learning. Grohe\n(PODS 2020) proposed the theoretical foundations for using homomorphism counts\nin machine learning on graph level as well as node level tasks. By their very\nnature, these capture local structural information, which enables the creation\nof robust structural embeddings. While a first approach for graph level tasks\nhas been made by Nguyen and Maehara (ICML 2020), we experimentally show the\neffectiveness of homomorphism count based node embeddings. Enriched with node\nlabels, node weights, and edge weights, these offer an interpretable\nrepresentation of graph data, allowing for enhanced explainability of machine\nlearning models.\n We propose a theoretical framework for isomorphism-invariant homomorphism\ncount based embeddings which lend themselves to a wide variety of downstream\ntasks. Our approach capitalises on the efficient computability of graph\nhomomorphism counts for bounded treewidth graph classes, rendering it a\npractical solution for real-world applications. We demonstrate their\nexpressivity through experiments on benchmark datasets. Although our results do\nnot match the accuracy of state-of-the-art neural architectures, they are\ncomparable to other advanced graph learning models. Remarkably, our approach\ndemarcates itself by ensuring explainability for each individual feature. By\nintegrating interpretable machine learning algorithms like SVMs or Random\nForests, we establish a seamless, end-to-end explainable pipeline. Our study\ncontributes to the advancement of graph-based techniques that offer both\nperformance and interpretability.\n","authors":["Hinrikus Wolf","Luca Oeljeklaus","Pascal Kühner","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2308.15283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07439v2","updated":"2023-08-29T12:57:17Z","published":"2023-07-14T16:04:03Z","title":"Atlas-Based Interpretable Age Prediction In Whole-Body MR Images","summary":" Age prediction is an important part of medical assessments and research. It\ncan aid in detecting diseases as well as abnormal ageing by highlighting the\ndiscrepancy between chronological and biological age. To gain a comprehensive\nunderstanding of age-related changes observed in various body parts, we\ninvestigate them on a larger scale by using whole-body images. We utilise the\nGrad-CAM interpretability method to determine the body areas most predictive of\na person's age. 
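Returning to the homomorphism-count node embeddings a few entries above: for path patterns, the per-node counts reduce to walk counts, which can be read directly off adjacency-matrix powers. The sketch below covers only this special case (the paper treats general bounded-treewidth patterns with labels and weights).

import numpy as np

# hom(P_{k+1} -> G) rooted at v equals the number of length-k walks from v,
# i.e. (A^k 1)_v, since walks may repeat vertices just as homomorphisms may.
def path_homomorphism_embedding(A, max_len=4):
    feats, vec = [], np.ones(A.shape[0])
    for _ in range(max_len):
        vec = A @ vec  # counts of walks one edge longer, per start node
        feats.append(vec.copy())
    return np.stack(feats, axis=1)  # one structural feature row per node

A = np.array([[0, 1, 1, 0], [1, 0, 1, 0], [1, 1, 0, 1], [0, 0, 1, 0]], float)
print(path_homomorphism_embedding(A))  # triangle-plus-pendant toy graph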
We expand our analysis beyond individual subjects by employing\nregistration techniques to generate population-wide interpretability maps.\nFurthermore, we set state-of-the-art whole-body age prediction with a model\nthat achieves a mean absolute error of 2.76 years. Our findings reveal three\nprimary areas of interest: the spine, the autochthonous back muscles, and the\ncardiac region, which exhibits the highest importance.\n","authors":["Sophie Starck","Yadunandan Vivekanand Kini","Jessica Johanna Maria Ritter","Rickmer Braren","Daniel Rueckert","Tamara Mueller"],"pdf_url":"https://arxiv.org/pdf/2307.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10167v3","updated":"2023-08-29T12:49:55Z","published":"2023-03-17T17:54:25Z","title":"Generalized partitioned local depth","summary":" In this paper we provide a generalization of the concept of cohesion as\nintroduced recently by Berenhaut, Moore and Melvin [Proceedings of the National\nAcademy of Sciences, 119 (4) (2022)]. The formulation presented builds on the\ntechnique of partitioned local depth by distilling two key probabilistic\nconcepts: local relevance and support division. Earlier results are extended\nwithin the new context, and examples of applications to revealing communities\nin data with uncertainty are included. The work sheds light on the foundations\nof partitioned local depth, and extends the original ideas to enable\nprobabilistic consideration of uncertain, variable and potentially conflicting\ninformation.\n","authors":["Kenneth S. Berenhaut","John D. Foley","Liangdongsheng Lyu"],"pdf_url":"https://arxiv.org/pdf/2303.10167v3.pdf","comment":"Improved exposition & motivation, references added, 19 pages, 6\n figures"},{"id":"http://arxiv.org/abs/2308.15256v1","updated":"2023-08-29T12:30:53Z","published":"2023-08-29T12:30:53Z","title":"Let There Be Sound: Reconstructing High Quality Speech from Silent\n Videos","summary":" The goal of this work is to reconstruct high quality speech from lip motions\nalone, a task also known as lip-to-speech. A key challenge of lip-to-speech\nsystems is the one-to-many mapping caused by (1) the existence of homophenes\nand (2) multiple speech variations, resulting in a mispronounced and\nover-smoothed speech. In this paper, we propose a novel lip-to-speech system\nthat significantly improves the generation quality by alleviating the\none-to-many mapping problem from multiple perspectives. Specifically, we\nincorporate (1) self-supervised speech representations to disambiguate\nhomophenes, and (2) acoustic variance information to model diverse speech\nstyles. Additionally, to better solve the aforementioned problem, we employ a\nflow based post-net which captures and refines the details of the generated\nspeech. We perform extensive experiments and demonstrate that our method\nachieves the generation quality close to that of real human utterance,\noutperforming existing methods in terms of speech naturalness and\nintelligibility by a large margin. 
Synthesised samples are available at the\nanonymous demo page: https://mm.kaist.ac.kr/projects/LTBS.\n","authors":["Ji-Hoon Kim","Jaehun Kim","Joon Son Chung"],"pdf_url":"https://arxiv.org/pdf/2308.15256v1.pdf","comment":"10 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2208.14407v2","updated":"2023-08-29T12:21:30Z","published":"2022-08-30T17:19:26Z","title":"An Analysis of Abstracted Model-Based Reinforcement Learning","summary":" Many methods for Model-based Reinforcement learning (MBRL) in Markov decision\nprocesses (MDPs) provide guarantees for both the accuracy of the model they can\ndeliver and the learning efficiency. At the same time, state abstraction\ntechniques allow for a reduction of the size of an MDP while maintaining a\nbounded loss with respect to the original problem. Therefore, it may come as a\nsurprise that no such guarantees are available when combining both techniques,\ni.e., where MBRL merely observes abstract states. Our theoretical analysis\nshows that abstraction can introduce a dependence between samples collected\nonline (e.g., in the real world). That means that, without taking this\ndependence into account, results for MBRL do not directly extend to this\nsetting. Our result shows that we can use concentration inequalities for\nmartingales to overcome this problem. This result makes it possible to extend\nthe guarantees of existing MBRL algorithms to the setting with abstraction. We\nillustrate this by combining R-MAX, a prototypical MBRL algorithm, with\nabstraction, thus producing the first performance guarantees for model-based\n`RL from Abstracted Observations': model-based reinforcement learning with an\nabstract model.\n","authors":["Rolf A. N. Starre","Marco Loog","Elena Congeduti","Frans A. Oliehoek"],"pdf_url":"https://arxiv.org/pdf/2208.14407v2.pdf","comment":"36 pages, 2 figures, submitted to TMLR"},{"id":"http://arxiv.org/abs/2206.07785v4","updated":"2023-08-29T12:19:19Z","published":"2022-06-15T19:48:10Z","title":"Strategic Coalition for Data Pricing in IoT Data Markets","summary":" This paper considers a market for trading Internet of Things (IoT) data that\nis used to train machine learning models. The data, either raw or processed, is\nsupplied to the market platform through a network and the price of such data is\ncontrolled based on the value it brings to the machine learning model. We\nexplore the correlation property of data in a game-theoretical setting to\neventually derive a simplified distributed solution for a data trading\nmechanism that emphasizes the mutual benefit of devices and the market. The key\nproposal is an efficient algorithm for markets that jointly addresses the\nchallenges of availability and heterogeneity in participation, as well as the\ntransfer of trust and the economic value of data exchange in IoT networks. The\nproposed approach establishes the data market by reinforcing collaboration\nopportunities between devices with correlated data to avoid information leakage.\nTherein, we develop a network-wide optimization problem that maximizes the\nsocial value of coalitions among the IoT devices of similar data types; at the\nsame time, it minimizes the cost due to network externalities, i.e., the impact\nof information leakage due to data correlation, as well as the opportunity\ncosts. Finally, we reveal the structure of the formulated problem as a\ndistributed coalition game and solve it following the simplified\nsplit-and-merge algorithm. 
Simulation results show the efficacy of our proposed\nmechanism design toward a trusted IoT data market, with up to 32.72% gain in\nthe average payoff for each seller.\n","authors":["Shashi Raj Pandey","Pierre Pinson","Petar Popovski"],"pdf_url":"https://arxiv.org/pdf/2206.07785v4.pdf","comment":"15 pages. 12 figures. This paper has been accepted for publication in\n IEEE Internet of Things Journal. Copyright may change without notice"},{"id":"http://arxiv.org/abs/2308.15250v1","updated":"2023-08-29T12:16:57Z","published":"2023-08-29T12:16:57Z","title":"The Relative Gaussian Mechanism and its Application to Private Gradient\n Descent","summary":" The Gaussian Mechanism (GM), which consists in adding Gaussian noise to a\nvector-valued query before releasing it, is a standard privacy protection\nmechanism. In particular, given that the query respects some L2 sensitivity\nproperty (the L2 distance between outputs on any two neighboring inputs is\nbounded), GM guarantees R\\'enyi Differential Privacy (RDP). Unfortunately,\nprecisely bounding the L2 sensitivity can be hard, thus leading to loose\nprivacy bounds. In this work, we consider a Relative L2 sensitivity assumption,\nin which the bound on the distance between two query outputs may also depend on\ntheir norm. Leveraging this assumption, we introduce the Relative Gaussian\nMechanism (RGM), in which the variance of the noise depends on the norm of the\noutput. We prove tight bounds on the RDP parameters under relative L2\nsensitivity, and characterize the privacy loss incurred by using\noutput-dependent noise. In particular, we show that RGM naturally adapts to a\nlatent variable that would control the norm of the output. Finally, we\ninstantiate our framework to show tight guarantees for Private Gradient\nDescent, a problem that naturally fits our relative L2 sensitivity assumption.\n","authors":["Hadrien Hendrikx","Paul Mangold","Aurélien Bellet"],"pdf_url":"https://arxiv.org/pdf/2308.15250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15243v1","updated":"2023-08-29T12:09:22Z","published":"2023-08-29T12:09:22Z","title":"Reliability Gaps Between Groups in COMPAS Dataset","summary":" This paper investigates the inter-rater reliability of risk assessment\ninstruments (RAIs). The main question is whether different, socially salient\ngroups are affected differently by a lack of inter-rater reliability of RAIs,\nthat is, whether mistakes with respect to different groups affect them\ndifferently. The question is investigated with a simulation study of the COMPAS\ndataset. A controlled degree of noise is injected into the input data of a\npredictive model; the noise can be interpreted as a synthetic rater that makes\nmistakes. The main finding is that there are systematic differences in output\nreliability between groups in the COMPAS dataset. The sign of the difference\ndepends on the kind of inter-rater statistic that is used (Cohen's Kappa,\nByrt's PABAK, ICC), and in particular whether or not a correction for the\nprediction prevalences of the groups is used.\n","authors":["Tim Räz"],"pdf_url":"https://arxiv.org/pdf/2308.15243v1.pdf","comment":"15 pages + appendix"},{"id":"http://arxiv.org/abs/2308.15237v1","updated":"2023-08-29T11:52:31Z","published":"2023-08-29T11:52:31Z","title":"Assessing Cyclostationary Malware Detection via Feature Selection and\n Classification","summary":" Cyclostationarity involves periodic statistical variations in signals and\nprocesses, commonly used in signal analysis and network security. 
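The simulation design described for the COMPAS study suggests a simple recipe that can be sketched directly: perturb the inputs of a fixed predictive model to emulate a synthetic rater, then compare inter-rater agreement between the clean and noisy predictions per group. The model, noise level, and data below are toy assumptions, not the paper's setup.

import numpy as np

def cohens_kappa(a, b):
    po = (a == b).mean()  # observed agreement
    pe = sum((a == c).mean() * (b == c).mean() for c in np.unique(np.r_[a, b]))
    return (po - pe) / (1 - pe)  # chance-corrected agreement

rng = np.random.default_rng(0)
n = 5000
group = rng.integers(0, 2, n)  # two socially salient groups
X = rng.standard_normal((n, 4)) + 0.5 * group[:, None]
w = np.array([1.0, -1.0, 0.5, 0.25])  # fixed toy predictive model
clean = (X @ w > 0).astype(int)
noisy = ((X + rng.normal(0, 0.3, X.shape)) @ w > 0).astype(int)  # synthetic rater
for g in (0, 1):
    m = group == g
    print(f"group {g}: kappa = {cohens_kappa(clean[m], noisy[m]):.3f}")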
In the\ncontext of attacks, cyclostationarity helps detect malicious behaviors within\nnetwork traffic, such as traffic patterns in Distributed Denial of Service\n(DDoS) attacks or hidden communication channels in malware. This approach\nenhances security by identifying abnormal patterns and informing Network\nIntrusion Detection Systems (NIDSs) to recognize potential attacks, enhancing\nprotection against both known and novel threats. This research focuses on\nidentifying cyclostationary malware behavior and its detection. The main goal\nis to pinpoint essential cyclostationary features used in NIDSs. These features\nare extracted using algorithms such as Boruta and Principal Component Analysis\n(PCA), and then categorized to find the most significant cyclostationary\npatterns. The aim of this article is to reveal periodically changing malware\nbehaviors through cyclostationarity. The study highlights the importance of\nspotting cyclostationary malware in NIDSs by using established datasets like\nKDD99, NSL-KDD, and the UGRansome dataset. The UGRansome dataset is designed\nfor anomaly detection research and includes both normal and abnormal network\nthreat categories of zero-day attacks. A comparison is made using the Random\nForest (RF) and Support Vector Machine (SVM) algorithms, while also evaluating\nthe effectiveness of Boruta and PCA. The findings show that PCA is more\npromising than using Boruta alone for extracting cyclostationary network\nfeature patterns. Additionally, the analysis identifies the internet protocol\nas the most noticeable cyclostationary feature pattern used by malware.\nNotably, the UGRansome dataset outperforms the KDD99 and NSL-KDD, achieving 99%\naccuracy in signature malware detection using the RF algorithm and 98% with the\nSVM.\n","authors":["Mike Nkongolo"],"pdf_url":"https://arxiv.org/pdf/2308.15237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05516v2","updated":"2023-08-29T11:42:29Z","published":"2023-02-10T21:40:37Z","title":"Cyclic and Randomized Stepsizes Invoke Heavier Tails in SGD than\n Constant Stepsize","summary":" Cyclic and randomized stepsizes are widely used in the deep learning practice\nand can often outperform standard stepsize choices such as constant stepsize in\nSGD. Despite their empirical success, not much is currently known about when\nand why they can theoretically improve the generalization performance. We\nconsider a general class of Markovian stepsizes for learning, which contain\ni.i.d. random stepsize, cyclic stepsize as well as the constant stepsize as\nspecial cases, and motivated by the literature which shows that heaviness of\nthe tails (measured by the so-called \"tail-index\") in the SGD iterates is\ncorrelated with generalization, we study tail-index and provide a number of\ntheoretical results that demonstrate how the tail-index varies on the stepsize\nscheduling. Our results bring a new understanding of the benefits of cyclic and\nrandomized stepsizes compared to constant stepsize in terms of the tail\nbehavior. We illustrate our theory on linear regression experiments and show\nthrough deep learning experiments that Markovian stepsizes can achieve even a\nheavier tail and be a viable alternative to cyclic and i.i.d. 
randomized\nstepsize rules.\n","authors":["Mert Gürbüzbalaban","Yuanhan Hu","Umut Şimşekli","Lingjiong Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.05516v2.pdf","comment":"To Appear"},{"id":"http://arxiv.org/abs/2308.15232v1","updated":"2023-08-29T11:40:24Z","published":"2023-08-29T11:40:24Z","title":"Classification-Aware Neural Topic Model Combined With Interpretable\n Analysis -- For Conflict Classification","summary":" A large number of conflict events affect the world all the time. In\norder to analyse such conflict events effectively, this paper presents a\nClassification-Aware Neural Topic Model (CANTM-IA) for Conflict Information\nClassification and Topic Discovery. The model provides a reliable\ninterpretation of classification results and discovered topics by introducing\ninterpretability analysis. At the same time, interpretation is introduced into\nthe model architecture to improve the classification performance of the model\nand to allow interpretation to focus further on the details of the data.\nFinally, the model architecture is optimised to reduce the complexity of the\nmodel.\n","authors":["Tianyu Liang","Yida Mu","Soonho Kim","Darline Larissa Kengne Kuate","Julie Lang","Rob Vos","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.15232v1.pdf","comment":"Accepted by RANLP 2023"},{"id":"http://arxiv.org/abs/2308.15230v1","updated":"2023-08-29T11:37:33Z","published":"2023-08-29T11:37:33Z","title":"Providing Previously Unseen Users Fair Recommendations Using Variational\n Autoencoders","summary":" An emerging definition of fairness in machine learning requires that models\nare oblivious to demographic user information, e.g., a user's gender or age\nshould not influence the model. Personalized recommender systems are\nparticularly prone to violating this definition through their explicit user\nfocus and user modelling. Explicit user modelling is also an aspect that makes\nmany recommender systems incapable of providing hitherto unseen users with\nrecommendations. We propose novel approaches for mitigating discrimination in\nVariational Autoencoder-based recommender systems by limiting the encoding of\ndemographic information. The approaches are capable of, and evaluated on,\nproviding users that are not represented in the training data with fair\nrecommendations.\n","authors":["Bjørnar Vassøy","Helge Langseth","Benjamin Kille"],"pdf_url":"https://arxiv.org/pdf/2308.15230v1.pdf","comment":"Appearing in RecSys 2023 proceedings"},{"id":"http://arxiv.org/abs/2308.15223v1","updated":"2023-08-29T11:24:12Z","published":"2023-08-29T11:24:12Z","title":"Evaluating Explanation Methods for Multivariate Time Series\n Classification","summary":" Multivariate time series classification is an important computational task\narising in applications where data is recorded over time and over multiple\nchannels. For example, a smartwatch can record the acceleration and orientation\nof a person's motion, and these signals are recorded as multivariate time\nseries. We can classify this data to understand and predict human movement and\nvarious properties such as fitness levels. In many applications, classification\nalone is not enough; we often need to classify but also understand what the\nmodel learns (e.g., why was a prediction given, based on what information in\nthe data). 
The main focus of this paper is on analysing and evaluating\nexplanation methods tailored to Multivariate Time Series Classification (MTSC).\nWe focus on saliency-based explanation methods that can point out the most\nrelevant channels and time series points for the classification decision. We\nanalyse two popular and accurate multivariate time series classifiers, ROCKET\nand dResNet, as well as two popular explanation methods, SHAP and dCAM. We\nstudy these methods on 3 synthetic datasets and 2 real-world datasets and\nprovide a quantitative and qualitative analysis of the explanations provided.\nWe find that flattening the multivariate datasets by concatenating the channels\nworks as well as using multivariate classifiers directly and adaptations of\nSHAP for MTSC work quite well. Additionally, we also find that the popular\nsynthetic datasets we used are not suitable for time series analysis.\n","authors":["Davide Italo Serramazza","Thu Trang Nguyen","Thach Le Nguyen","Georgiana Ifrim"],"pdf_url":"https://arxiv.org/pdf/2308.15223v1.pdf","comment":"Accepted at AALTD '23"},{"id":"http://arxiv.org/abs/2304.12154v2","updated":"2023-08-29T11:19:40Z","published":"2023-04-24T15:05:04Z","title":"Explainable AI Insights for Symbolic Computation: A case study on\n selecting the variable ordering for cylindrical algebraic decomposition","summary":" In recent years there has been increased use of machine learning (ML)\ntechniques within mathematics, including symbolic computation where it may be\napplied safely to optimise or select algorithms. This paper explores whether\nusing explainable AI (XAI) techniques on such ML models can offer new insight\nfor symbolic computation, inspiring new implementations within computer algebra\nsystems that do not directly call upon AI tools. We present a case study on the\nuse of ML to select the variable ordering for cylindrical algebraic\ndecomposition. It has already been demonstrated that ML can make the choice\nwell, but here we show how the SHAP tool for explainability can be used to\ninform new heuristics of a size and complexity similar to those human-designed\nheuristics currently commonly used in symbolic computation.\n","authors":["Lynn Pickering","Tereso Del Rio Almajano","Matthew England","Kelly Cohen"],"pdf_url":"https://arxiv.org/pdf/2304.12154v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2301.03364v4","updated":"2023-08-29T11:18:43Z","published":"2022-12-20T15:04:20Z","title":"Towards an AI-enabled Connected Industry: AGV Communication and Sensor\n Measurement Datasets","summary":" This paper presents two wireless measurement campaigns in industrial\ntestbeds: industrial Vehicle-to-vehicle (iV2V) and industrial\nVehicle-to-infrastructure plus Sensor (iV2I+), together with detailed\ninformation about the two captured datasets. iV2V covers sidelink communication\nscenarios between Automated Guided Vehicles (AGVs), while iV2I+ is conducted at\nan industrial setting where an autonomous cleaning robot is connected to a\nprivate cellular network. 
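The channel-flattening baseline from the multivariate time series evaluation above is easy to make concrete: concatenate the channels of each series and hand the result to an ordinary classifier. The shapes, data, and ridge classifier below are illustrative assumptions.

import numpy as np
from sklearn.linear_model import RidgeClassifierCV

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 3, 150))         # (samples, channels, length)
y = (X[:, 0, :].mean(axis=1) > 0).astype(int)  # toy label from channel 0

X_flat = X.reshape(len(X), -1)                 # (samples, channels * length)
clf = RidgeClassifierCV().fit(X_flat[:80], y[:80])
print("accuracy:", clf.score(X_flat[80:], y[80:]))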
The combination of different communication\ntechnologies within a common measurement methodology provides insights that can\nbe exploited by Machine Learning (ML) for tasks such as fingerprinting,\nline-of-sight detection, prediction of quality of service or link selection.\nMoreover, the datasets are publicly available, labelled and prefiltered for\nfast on-boarding and applicability.\n","authors":["Rodrigo Hernangómez","Alexandros Palaios","Cara Watermann","Daniel Schäufele","Philipp Geuer","Rafail Ismayilov","Mohammad Parvini","Anton Krause","Martin Kasparick","Thomas Neugebauer","Oscar D. Ramos-Cantor","Hugues Tchouankem","Jose Leon Calvo","Bo Chen","Gerhard Fettweis","Sławomir Stańczak"],"pdf_url":"https://arxiv.org/pdf/2301.03364v4.pdf","comment":"7 pages, 3 figures. Submitted to a magazine. Datasets available at\n https://ieee-dataport.org/open-access/ai4mobile-industrial-wireless-datasets-iv2v-and-iv2i"},{"id":"http://arxiv.org/abs/2211.14573v3","updated":"2023-08-29T10:59:41Z","published":"2022-11-26T14:00:18Z","title":"Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation\n for Pretrained Deep Generative Model","summary":" Semantic editing of images is the fundamental goal of computer vision.\nAlthough deep learning methods, such as generative adversarial networks (GANs),\nare capable of producing high-quality images, they often do not have an\ninherent way of editing generated images semantically. Recent studies have\ninvestigated a way of manipulating the latent variable to determine the images\nto be generated. However, methods that assume linear semantic arithmetic have\ncertain limitations in terms of the quality of image editing, whereas methods\nthat discover nonlinear semantic pathways provide non-commutative editing,\nwhich is inconsistent when applied in different orders. This study proposes a\nnovel method called deep curvilinear editing (DeCurvEd) to determine semantic\ncommuting vector fields on the latent space. We theoretically demonstrate that\nowing to commutativity, the editing of multiple attributes depends only on the\nquantities and not on the order. Furthermore, we experimentally demonstrate\nthat compared to previous methods, the nonlinear and commutative nature of\nDeCurvEd facilitates the disentanglement of image attributes and provides\nhigher-quality editing.\n","authors":["Takehiro Aoshima","Takashi Matsubara"],"pdf_url":"https://arxiv.org/pdf/2211.14573v3.pdf","comment":"15 pages. The last update made no changes except for adding the\n following link to the CVF repository:\n https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html.\n Here, you can find our code to reproduce our results"},{"id":"http://arxiv.org/abs/2205.09862v2","updated":"2023-08-29T10:47:39Z","published":"2022-05-19T21:10:52Z","title":"Recurrent segmentation meets block models in temporal networks","summary":" A popular approach to model interactions is to represent them as a network\nwith nodes being the agents and the interactions being the edges. Interactions\nare often timestamped, which leads to having timestamped edges. Many real-world\ntemporal networks have a recurrent or possibly cyclic behaviour. For example,\nsocial network activity may be heightened during certain hours of day. In this\npaper, our main interest is to model recurrent activity in such temporal\nnetworks. 
As a starting point we use the stochastic block model, a popular choice\nfor modelling static networks, where nodes are split into $R$ groups. We extend\nthis model to temporal networks by modelling the edges with a Poisson process.\nWe make the parameters of the process dependent on time by segmenting the\ntimeline into $K$ segments. To enforce the recurring activity we require that only\n$H < K$ different sets of parameters can be used, that is, several, not\nnecessarily consecutive, segments must share their parameters. We prove that\nsearching for optimal blocks and segmentation is an NP-hard problem.\nConsequently, we split the problem into 3 subproblems where we optimize blocks,\nmodel parameters, and segmentation in turn while keeping the remaining\nstructures fixed. We propose an iterative algorithm that requires $O(KHm + Rn +\nR^2H)$ time per iteration, where $n$ and $m$ are the number of nodes and edges\nin the network. We demonstrate experimentally that the number of required\niterations is typically low, that the algorithm is able to discover the ground truth\nfrom synthetic datasets, and that certain real-world networks exhibit\nrecurrent behaviour, as the likelihood does not deteriorate when $H$ is lowered.\n","authors":["Chamalee Wickrama Arachchi","Nikolaj Tatti"],"pdf_url":"https://arxiv.org/pdf/2205.09862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15194v1","updated":"2023-08-29T10:21:50Z","published":"2023-08-29T10:21:50Z","title":"Ensemble of Counterfactual Explainers","summary":" In eXplainable Artificial Intelligence (XAI), several counterfactual\nexplainers have been proposed, each focusing on some desirable properties of\ncounterfactual instances: minimality, actionability, stability, diversity,\nplausibility, discriminative power. We propose an ensemble of counterfactual\nexplainers that boosts weak explainers, which provide only a subset of such\nproperties, to a powerful method covering all of them. The ensemble runs weak\nexplainers on a sample of instances and of features, and it combines their\nresults by exploiting a diversity-driven selection function. The method is\nmodel-agnostic and, through a wrapping approach based on autoencoders, it is\nalso data-agnostic.\n","authors":["Riccardo Guidotti","Salvatore Ruggieri"],"pdf_url":"https://arxiv.org/pdf/2308.15194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11705v2","updated":"2023-08-29T10:08:24Z","published":"2023-04-23T17:43:29Z","title":"Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR\n Semantic Segmentation","summary":" The ability to deploy robots that can operate safely in diverse environments\nis crucial for developing embodied intelligent agents. As a community, we have\nmade tremendous progress in within-domain LiDAR semantic segmentation. However,\ndo these methods generalize across domains? To answer this question, we design\nthe first experimental setup for studying domain generalization (DG) for LiDAR\nsemantic segmentation (DG-LSS). Our results confirm a significant gap between\nmethods, evaluated in a cross-domain setting: for example, a model trained on\nthe source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data,\ncompared to $48.49$ mIoU obtained by the model trained on the target domain\n(nuScenes). To tackle this gap, we propose the first method specifically\ndesigned for DG-LSS, which obtains $34.88$ mIoU on the target domain,\noutperforming all baselines. 
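To make the temporal block model above concrete, the sketch below scores one candidate configuration (node groups, segmentation, and the mapping of the $K$ segments onto $H$ shared parameter sets) with a Poisson log-likelihood, up to additive terms that do not depend on the rates and ignoring self-pair corrections; the paper's iterative search is not reproduced.

import numpy as np

def poisson_loglik(edge_counts, durations, group_sizes, seg2param, rates):
    # edge_counts[k][a][b]: edges between groups a and b during segment k
    # rates[h][a][b]: Poisson rate of shared parameter set h for pair (a, b)
    # seg2param[k]: which of the H parameter sets segment k uses
    ll = 0.0
    for k, dur in enumerate(durations):
        lam = rates[seg2param[k]]
        for a in range(len(group_sizes)):
            for b in range(len(group_sizes)):
                exposure = group_sizes[a] * group_sizes[b] * dur
                ll += edge_counts[k][a][b] * np.log(lam[a][b]) - lam[a][b] * exposure
    return ll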
Our method augments a sparse-convolutional\nencoder-decoder 3D segmentation network with an additional, dense 2D\nconvolutional decoder that learns to classify a birds-eye view of the point\ncloud. This simple auxiliary task encourages the 3D network to learn features\nthat are robust to sensor placement shifts and resolution, and are transferable\nacross domains. With this work, we aim to inspire the community to develop and\nevaluate future models in such cross-domain conditions.\n","authors":["Cristiano Saltori","Aljoša Ošep","Elisa Ricci","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2304.11705v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15172v1","updated":"2023-08-29T09:54:30Z","published":"2023-08-29T09:54:30Z","title":"Is visual explanation with Grad-CAM more reliable for deeper neural\n networks? a case study with automatic pneumothorax diagnosis","summary":" While deep learning techniques have provided the state-of-the-art performance\nin various clinical tasks, explainability regarding their decision-making\nprocess can greatly enhance the credence of these methods for safer and quicker\nclinical adoption. With high flexibility, Gradient-weighted Class Activation\nMapping (Grad-CAM) has been widely adopted to offer intuitive visual\ninterpretation of various deep learning models' reasoning processes in\ncomputer-assisted diagnosis. However, despite the popularity of the technique,\nthere is still a lack of systematic study on Grad-CAM's performance on\ndifferent deep learning architectures. In this study, we investigate its\nrobustness and effectiveness across different popular deep learning models,\nwith a focus on the impact of the networks' depths and architecture types, by\nusing a case study of automatic pneumothorax diagnosis in X-ray scans. Our\nresults show that deeper neural networks do not necessarily contribute to a\nstrong improvement of pneumothorax diagnosis accuracy, and the effectiveness of\nGradCAM also varies among different network architectures.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.15172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15164v1","updated":"2023-08-29T09:46:52Z","published":"2023-08-29T09:46:52Z","title":"ABS-SGD: A Delayed Synchronous Stochastic Gradient Descent Algorithm\n with Adaptive Batch Size for Heterogeneous GPU Clusters","summary":" As the size of models and datasets grows, it has become increasingly common\nto train models in parallel. However, existing distributed stochastic gradient\ndescent (SGD) algorithms suffer from insufficient utilization of computational\nresources and poor convergence in heterogeneous clusters. In this paper, we\npropose a delayed synchronous SGD algorithm with adaptive batch size (ABS-SGD)\nfor heterogeneous GPU clusters. In ABS-SGD, workers perform global\nsynchronization to accumulate delayed gradients and use the accumulated delayed\ngradients to update parameters. While workers are performing global\nsynchronization for delayed gradients, they perform the computation of the next\nbatch without specifying batch size in advance, which lasts until the next\nglobal synchronization starts, realizing the full utilization of computational\nresources. Since the gradient delay is only one iteration, the stale gradient\nproblem can be alleviated. We theoretically prove the convergence of ABS-SGD in\nheterogeneous clusters. 
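A single-process toy simulation of the delayed-gradient scheme the ABS-SGD abstract describes: each update applies gradients accumulated in the previous iteration (one step of staleness), and each simulated worker contributes a batch whose size stands in for its speed. The model, worker speeds, and equal gradient averaging are assumptions, not the paper's system.

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((4096, 5))
true_w = np.array([1.0, -2.0, 0.5, 0.0, 3.0])
y = np.sign(X @ true_w + 0.1 * rng.standard_normal(4096))

def grad(w, idx):  # mean logistic-loss gradient on one batch
    margin = y[idx] * (X[idx] @ w)
    return -(X[idx] * (y[idx] / (1 + np.exp(margin)))[:, None]).mean(axis=0)

w = np.zeros(5)
delayed = np.zeros_like(w)
for it in range(200):
    w -= 0.5 * delayed  # apply the gradients gathered in the last iteration
    worker_grads = []
    for batch_size in (64, 128, 512):  # heterogeneous worker speeds
        idx = rng.integers(0, len(X), size=batch_size)
        worker_grads.append(grad(w, idx))
    delayed = np.mean(worker_grads, axis=0)  # becomes next iteration's update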
Extensive experiments in three types of heterogeneous\nclusters demonstrate that ABS-SGD can make full use of computational resources\nand accelerate model convergence: When training the ResNet18 network with 4\nworkers, ABS-SGD increases the convergence speed by 1.30x on average compared\nwith the best baseline algorithm.\n","authors":["Xin Zhou","Ling Chen","Houming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.15164v1.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.15157v1","updated":"2023-08-29T09:39:12Z","published":"2023-08-29T09:39:12Z","title":"On the improvement of model-predictive controllers","summary":" This article investigates synthetic model-predictive control (MPC) problems\nto demonstrate that an increased precision of the internal prediction model\n(PM) automatically entails an improvement of the controller as a whole. In\ncontrast to reinforcement learning (RL), MPC uses the PM to predict subsequent\nstates of the controlled system (CS), instead of directly recommending suitable\nactions. To assess how the precision of the PM translates into the quality of\nthe model-predictive controller, we compare a DNN-based PM to the optimal\nbaseline PM for three well-known control problems of varying complexity. The\nbaseline PM achieves perfect accuracy by accessing the simulation of the CS\nitself. Based on the obtained results, we argue that an improvement of the PM\nwill always improve the controller as a whole, without considering the impact\nof other components such as action selection (which, in this article, relies on\nevolutionary optimization).\n","authors":["L. Féret","A. Gepperth","S. Lambeck"],"pdf_url":"https://arxiv.org/pdf/2308.15157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01055v2","updated":"2023-08-29T09:38:14Z","published":"2023-03-02T08:24:27Z","title":"Physics-informed neural networks for solving forward and inverse\n problems in complex beam systems","summary":" This paper proposes a new framework using physics-informed neural networks\n(PINNs) to simulate complex structural systems that consist of single and\ndouble beams based on Euler-Bernoulli and Timoshenko theory, where the double\nbeams are connected with a Winkler foundation. In particular, forward and\ninverse problems for the Euler-Bernoulli and Timoshenko partial differential\nequations (PDEs) are solved using nondimensional equations with the\nphysics-informed loss function. Higher-order complex beam PDEs are efficiently\nsolved for forward problems to compute the transverse displacements and\ncross-sectional rotations with less than 1e-3 percent error. Furthermore,\ninverse problems are robustly solved to determine the unknown dimensionless\nmodel parameters and applied force in the entire space-time domain, even in the\ncase of noisy data. The results suggest that PINNs are a promising strategy for\nsolving problems in engineering structures and machines involving beam systems.\n","authors":["Taniya Kapoor","Hongrui Wang","Alfredo Nunez","Rolf Dollevoet"],"pdf_url":"https://arxiv.org/pdf/2303.01055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13821v2","updated":"2023-08-29T09:31:11Z","published":"2023-08-26T09:11:44Z","title":"A Survey of Imbalanced Learning on Graphs: Problems, Techniques, and\n Future Directions","summary":" Graphs represent interconnected structures prevalent in a myriad of\nreal-world scenarios. 
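For the beam-systems paper above, the physics-informed loss is the part a short sketch can clarify. Below is a hypothetical minimal PINN for a single nondimensional static Euler-Bernoulli beam, u'''' = q on [0, 1] with simply supported ends; the paper's coupled double-beam and Timoshenko settings are not reproduced, and the load q is an assumption.

import torch

net = torch.nn.Sequential(
    torch.nn.Linear(1, 32), torch.nn.Tanh(),
    torch.nn.Linear(32, 32), torch.nn.Tanh(),
    torch.nn.Linear(32, 1),
)

def derivatives(x):
    # u and its first four derivatives via repeated autograd
    u = net(x)
    d, out = u, []
    for _ in range(4):
        d = torch.autograd.grad(d.sum(), x, create_graph=True)[0]
        out.append(d)
    return u, out

x = torch.rand(128, 1, requires_grad=True)             # collocation points
xb = torch.tensor([[0.0], [1.0]], requires_grad=True)  # boundary points
q = lambda t: torch.sin(torch.pi * t)                  # assumed load
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

for step in range(2000):
    opt.zero_grad()
    _, d = derivatives(x)
    ub, db = derivatives(xb)
    loss = ((d[3] - q(x)) ** 2).mean()                    # residual of u'''' = q
    loss = loss + (ub ** 2).mean() + (db[1] ** 2).mean()  # u = u'' = 0 at ends
    loss.backward()
    opt.step()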
Effective graph analytics, such as graph learning\nmethods, enables users to gain profound insights from graph data, underpinning\nvarious tasks including node classification and link prediction. However, these\nmethods often suffer from data imbalance, a common issue in graph data where\ncertain segments possess abundant data while others are scarce, thereby leading\nto biased learning outcomes. This necessitates the emerging field of imbalanced\nlearning on graphs, which aims to correct these data distribution skews for\nmore accurate and representative learning outcomes. In this survey, we embark\non a comprehensive review of the literature on imbalanced learning on graphs.\nWe begin by providing a definitive understanding of the concept and related\nterminologies, establishing a strong foundational understanding for readers.\nFollowing this, we propose two comprehensive taxonomies: (1) the problem\ntaxonomy, which describes the forms of imbalance we consider, the associated\ntasks, and potential solutions; (2) the technique taxonomy, which details key\nstrategies for addressing these imbalances, and aids readers in their method\nselection process. Finally, we suggest prospective future directions for both\nproblems and techniques within the sphere of imbalanced learning on graphs,\nfostering further innovation in this critical area.\n","authors":["Zemin Liu","Yuan Li","Nan Chen","Qian Wang","Bryan Hooi","Bingsheng He"],"pdf_url":"https://arxiv.org/pdf/2308.13821v2.pdf","comment":"The collection of awesome literature on imbalanced learning on\n graphs: https://github.com/Xtra-Computing/Awesome-Literature-ILoGs"},{"id":"http://arxiv.org/abs/2210.00991v2","updated":"2023-08-29T09:23:24Z","published":"2022-10-03T14:57:46Z","title":"Policy Gradient for Reinforcement Learning with General Utilities","summary":" In Reinforcement Learning (RL), the goal of agents is to discover an optimal\npolicy that maximizes the expected cumulative rewards. This objective may also\nbe viewed as finding a policy that optimizes a linear function of its\nstate-action occupancy measure, hereafter referred to as Linear RL. However, many\nsupervised and unsupervised RL problems are not covered in the Linear RL\nframework, such as apprenticeship learning, pure exploration and variational\nintrinsic control, where the objectives are non-linear functions of the\noccupancy measures. RL with non-linear utilities looks unwieldy, as methods\nlike the Bellman equation, value iteration, policy gradient, and dynamic programming,\nwhich had tremendous success in Linear RL, fail to generalize trivially. In this\npaper, we derive the policy gradient theorem for RL with general utilities. The\npolicy gradient theorem proves to be a cornerstone in Linear RL due to its\nelegance and ease of implementability. Our policy gradient theorem for RL with\ngeneral utilities shares the same elegance and ease of implementability. Based\non the policy gradient theorem derived, we also present a simple sample-based\nalgorithm.
We believe our results will be of interest to the community and\noffer inspiration to future works in this generalized setting.\n","authors":["Navdeep Kumar","Kaixin Wang","Kfir Levy","Shie Mannor"],"pdf_url":"https://arxiv.org/pdf/2210.00991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15141v1","updated":"2023-08-29T09:19:49Z","published":"2023-08-29T09:19:49Z","title":"Uncertainty Aware Training to Improve Deep Learning Model Calibration\n for Classification of Cardiac MR Images","summary":" Quantifying uncertainty of predictions has been identified as one way to\ndevelop more trustworthy artificial intelligence (AI) models beyond\nconventional reporting of performance metrics. When considering their role in a\nclinical decision support setting, AI classification models should ideally\navoid confident wrong predictions and maximise the confidence of correct\npredictions. Models that do this are said to be well-calibrated with regard to\nconfidence. However, relatively little attention has been paid to how to\nimprove calibration when training these models, i.e., to make the training\nstrategy uncertainty-aware. In this work, we evaluate three novel\nuncertainty-aware training strategies, comparing them against two state-of-the-art\napproaches. We analyse performance on two different clinical applications:\ncardiac resynchronisation therapy (CRT) response prediction and coronary artery\ndisease (CAD) diagnosis from cardiac magnetic resonance (CMR) images. The\nbest-performing model in terms of both classification accuracy and the most\ncommon calibration measure, expected calibration error (ECE), was the Confidence\nWeight method, a novel approach that weights the loss of samples to explicitly\npenalise confident incorrect predictions. The method reduced the ECE by 17% for\nCRT response prediction and by 22% for CAD diagnosis when compared to a\nbaseline classifier in which no uncertainty-aware strategy was included. In\nboth applications, as well as reducing the ECE, there was a slight increase in\naccuracy from 69% to 70% and 70% to 72% for CRT response prediction and CAD\ndiagnosis, respectively. However, our analysis showed a lack of consistency in\nterms of optimal models when using different calibration measures. This\nindicates the need for careful consideration of performance metrics when\ntraining and selecting models for complex high-risk applications in healthcare.\n","authors":["Tareen Dawood","Chen Chen","Baldeep S. Sidhu","Bram Ruijsink","Justin Gould","Bradley Porter","Mark K. Elliott","Vishal Mehta","Christopher A. Rinaldi","Esther Puyol-Anton","Reza Razavi","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2308.15141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07015v2","updated":"2023-08-29T09:17:05Z","published":"2023-06-12T10:39:57Z","title":"Combining Primal and Dual Representations in Deep Restricted Kernel\n Machines Classifiers","summary":" In the context of deep learning with kernel machines, the deep Restricted\nKernel Machine (DRKM) framework allows multiple levels of kernel PCA (KPCA) and\nLeast-Squares Support Vector Machines (LSSVM) to be combined into a deep\narchitecture using visible and hidden units. We propose a new method for DRKM\nclassification coupling the objectives of KPCA and classification levels, with\nthe hidden feature matrix lying on the Stiefel manifold. The classification\nlevel can be formulated as an LSSVM or as an MLP feature map, combining depth\nin terms of levels and layers.
The classification level is expressed in its\nprimal formulation, while the deep KPCA levels, in their dual formulation, can\nembed the most informative components of the data in a much lower dimensional\nspace. The dual setting is independent of the dimension of the inputs and the\nprimal setting is parametric, which makes the proposed method computationally\nefficient for both high-dimensional inputs and large datasets. In the\nexperiments, we show that our developed algorithm can effectively learn from\nsmall datasets, while using less memory than the convolutional neural network\n(CNN) with high-dimensional data, and that models with multiple KPCA levels can\noutperform models with a single level. On the tested larger-scale datasets,\nDRKM is more energy efficient than CNN while maintaining comparable\nperformance.\n","authors":["Francesco Tonin","Panagiotis Patrinos","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2306.07015v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15132v1","updated":"2023-08-29T08:57:47Z","published":"2023-08-29T08:57:47Z","title":"Biquality Learning: a Framework to Design Algorithms Dealing with\n Closed-Set Distribution Shifts","summary":" Training machine learning models from data with weak supervision and dataset\nshifts is still challenging. Designing algorithms when these two situations\narise has not been explored much, and existing algorithms cannot always handle\nthe most complex distributional shifts. We think the biquality data setup is a\nsuitable framework for designing such algorithms. Biquality Learning assumes\nthat two datasets are available at training time: a trusted dataset sampled\nfrom the distribution of interest and the untrusted dataset with dataset shifts\nand weaknesses of supervision (aka distribution shifts). The trusted and\nuntrusted datasets available at training time make designing algorithms dealing\nwith any distribution shifts possible. We propose two methods, one inspired by\nthe label noise literature and another by the covariate shift literature, for\nbiquality learning. We experiment with two novel methods that synthetically\nintroduce concept drift and class-conditional shifts across many real-world\ndatasets. We open some discussions and assess that developing\nbiquality learning algorithms robust to distributional changes remains an\ninteresting problem for future research.\n","authors":["Pierre Nodet","Vincent Lemaire","Alexis Bondu","Antoine Cornuéjols"],"pdf_url":"https://arxiv.org/pdf/2308.15132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15126v1","updated":"2023-08-29T08:51:24Z","published":"2023-08-29T08:51:24Z","title":"Evaluation and Analysis of Hallucination in Large Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) have recently achieved remarkable\nsuccess. However, LVLMs are still plagued by the hallucination problem, which\nlimits their practicality in many scenarios. Hallucination refers to\ninformation in LVLMs' responses that does not exist in the visual input, which\nposes potential risks of substantial consequences. There has been limited work\nstudying hallucination evaluation in LVLMs. In this paper, we propose\nHallucination Evaluation based on Large Language Models (HaELM), an LLM-based\nhallucination evaluation framework. HaELM achieves approximately 95% of\nChatGPT's performance and has additional advantages including low\ncost, reproducibility, privacy preservation and local deployment.
Leveraging\nthe HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we\nanalyze the factors contributing to hallucination in LVLMs and offer helpful\nsuggestions to mitigate the hallucination problem. Our training data and human\nannotation hallucination data will be made public soon.\n","authors":["Junyang Wang","Yiyang Zhou","Guohai Xu","Pengcheng Shi","Chenlin Zhao","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Jihua Zhu","Jitao Sang","Haoyu Tang"],"pdf_url":"https://arxiv.org/pdf/2308.15126v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2210.13004v2","updated":"2023-08-29T08:35:07Z","published":"2022-10-24T07:50:02Z","title":"Efficient Representation of Natural Image Patches","summary":" In the complex domain of neural information processing, discerning\nfundamental principles from ancillary details remains a significant challenge.\nWhile there is extensive knowledge about the anatomy and physiology of the\nearly visual system, a comprehensive computational theory remains elusive. Can\nwe gain insights into the underlying principles of a biological system by\nabstracting away from its detailed implementation and focusing on the\nfundamental problems that the system is designed to solve? Utilizing an\nabstract model based on minimal yet realistic assumptions, we show how to\nachieve the early visual system's two ultimate objectives: efficient\ninformation transmission and sensor probability distribution modeling. We show\nthat optimizing for information transmission does not yield optimal probability\ndistribution modeling. We illustrate, using a two-pixel (2D) system and image\npatches, that an efficient representation can be realized via nonlinear\npopulation code driven by two types of biologically plausible loss functions\nthat depend solely on output. After unsupervised learning, our abstract IPU\nmodel bears remarkable resemblances to biological systems, despite not\nmimicking many features of real neurons, such as spiking activity. A\npreliminary comparison with a contemporary deep learning model suggests that\nthe IPU model offers a significant efficiency advantage. Our model provides\nnovel insights into the computational theory of early visual systems as well as\na potential new approach to enhance the efficiency of deep learning models.\n","authors":["Cheng Guo"],"pdf_url":"https://arxiv.org/pdf/2210.13004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15116v1","updated":"2023-08-29T08:29:08Z","published":"2023-08-29T08:29:08Z","title":"Mixup-Augmented Meta-Learning for Sample-Efficient Fine-Tuning of\n Protein Simulators","summary":" Molecular dynamics simulations have emerged as a fundamental instrument for\nstudying biomolecules. At the same time, it is desirable to perform simulations\nof a collection of particles under various conditions in which the molecules\ncan fluctuate. In this paper, we explore and adapt the soft prompt-based\nlearning method to molecular dynamics tasks. Our model can remarkably\ngeneralize to unseen and out-of-distribution scenarios with limited training\ndata. While our work focuses on temperature as a test case, the versatility of\nour approach allows for efficient simulation through any continuous dynamic\nconditions, such as pressure and volumes. Our framework has two stages: 1)\nPre-trains with data mixing technique, augments molecular structure data and\ntemperature prompts, then applies a curriculum learning method by increasing\nthe ratio of them smoothly. 
2) A meta-learning-based fine-tuning framework\nimproves the sample efficiency of the fine-tuning process and gives the soft\nprompt-tuning better initialization points. Comprehensive experiments reveal\nthat our framework excels in accuracy for in-domain data and demonstrates\nstrong generalization capabilities for unseen and out-of-distribution samples.\n","authors":["Jingbang Chen","Yian Wang","Xingwei Qu","Shuangjia Zheng","Yaodong Yang","Hao Dong","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2308.15116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15107v1","updated":"2023-08-29T08:14:19Z","published":"2023-08-29T08:14:19Z","title":"Stochastic Graph Bandit Learning with Side-Observations","summary":" In this paper, we investigate the stochastic contextual bandit with general\nfunction space and graph feedback. We propose an algorithm that addresses this\nproblem by adapting to both the underlying graph structures and reward gaps. To\nthe best of our knowledge, our algorithm is the first to provide a\ngap-dependent upper bound in this stochastic setting, bridging the research gap\nleft by the work in [35]. In comparison to [31,33,35], our method offers\nimproved regret upper bounds and does not require knowledge of graphical\nquantities. We conduct numerical experiments to demonstrate the computational\nefficiency and effectiveness of our approach in terms of regret upper bounds.\nThese findings highlight the significance of our algorithm in advancing the\nfield of stochastic contextual bandits with graph feedback, opening up avenues\nfor practical applications in various domains.\n","authors":["Xueping Gong","Jiheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15107v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2010.03104 by other authors"},{"id":"http://arxiv.org/abs/2201.01079v5","updated":"2023-08-29T08:10:29Z","published":"2022-01-04T10:49:30Z","title":"Incomplete Multi-View Weak-Label Learning with Noisy Features and\n Imbalanced Labels","summary":" A variety of modern applications exhibit multi-view multi-label learning,\nwhere each sample has multi-view features, and multiple labels are correlated\nvia common views. Current methods usually fail to directly deal with the\nsetting where only a subset of features and labels are observed for each\nsample, and ignore the presence of noisy views and imbalanced labels in\nreal-world problems. In this paper, we propose a novel method to overcome the\nlimitations. It jointly embeds incomplete views and weak labels into a\nlow-dimensional subspace with adaptive weights, and facilitates the difference\nbetween embedding weight matrices via the auto-weighted Hilbert-Schmidt\nIndependence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively\nlearns view-wise importance for embedding to detect noisy views, and mitigates\nthe label imbalance problem by focal loss.
Experimental results on four\nreal-world multi-view multi-label datasets demonstrate the effectiveness of the\nproposed method.\n","authors":["Zhiwei Li","Zijian Yang","Lu Sun","Mineichi Kudo","Keigo Kimura"],"pdf_url":"https://arxiv.org/pdf/2201.01079v5.pdf","comment":"6 pages, 2 figures, conference"},{"id":"http://arxiv.org/abs/2308.15096v1","updated":"2023-08-29T08:04:45Z","published":"2023-08-29T08:04:45Z","title":"How Faithful are Self-Explainable GNNs?","summary":" Self-explainable deep neural networks are a recent class of models that can\noutput ante-hoc local explanations that are faithful to the model's reasoning,\nand as such represent a step forward toward filling the gap between\nexpressiveness and interpretability. Self-explainable graph neural networks\n(GNNs) aim at achieving the same in the context of graph data. This begs the\nquestion: do these models fulfill their implicit guarantees in terms of\nfaithfulness? In this extended abstract, we analyze the faithfulness of several\nself-explainable GNNs using different measures of faithfulness, identify\nseveral limitations -- both in the models themselves and in the evaluation\nmetrics -- and outline possible ways forward.\n","authors":["Marc Christiansen","Lea Villadsen","Zhiqiang Zhong","Stefano Teso","Davide Mottin"],"pdf_url":"https://arxiv.org/pdf/2308.15096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15094v1","updated":"2023-08-29T08:02:41Z","published":"2023-08-29T08:02:41Z","title":"Group-Conditional Conformal Prediction via Quantile Regression\n Calibration for Crop and Weed Classification","summary":" As deep learning predictive models become an integral part of a large\nspectrum of precision agricultural systems, a barrier to the adoption of such\nautomated solutions is the lack of user trust in these highly complex, opaque\nand uncertain models. Indeed, deep neural networks are not equipped with any\nexplicit guarantees that can be used to certify the system's performance,\nespecially in highly varying uncontrolled environments such as the ones\ntypically faced in computer vision for agriculture. Fortunately, certain methods\ndeveloped in other communities can prove to be important for agricultural\napplications. This article presents the conformal prediction framework that\nprovides valid statistical guarantees on the predictive performance of any\nblack box prediction machine, with almost no assumptions, applied to the\nproblem of deep visual classification of weeds and crops in real-world\nconditions. The framework is presented with a focus on its practical aspects, with\nspecial attention accorded to the Adaptive Prediction Sets (APS) approach that\ndelivers marginal guarantees on the model's coverage. Marginal results are then\nshown to be insufficient to guarantee performance on all groups of individuals\nin the population as characterized by their environmental and pedo-climatic\nauxiliary data gathered during image acquisition. To tackle this shortcoming,\ngroup-conditional conformal approaches are presented: the ''classical'' method\nthat consists of iteratively applying the APS procedure on all groups, and a\nproposed elegant reformulation and implementation of the procedure using\nquantile regression on group membership indicators.
Empirical results showing\nthe validity of the proposed approach are presented and compared to the\nmarginal APS then discussed.\n","authors":["Paul Melki","Lionel Bombrun","Boubacar Diallo","Jérôme Dias","Jean-Pierre da Costa"],"pdf_url":"https://arxiv.org/pdf/2308.15094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15092v1","updated":"2023-08-29T07:58:19Z","published":"2023-08-29T07:58:19Z","title":"Can We Rely on AI?","summary":" Over the last decade, adversarial attack algorithms have revealed\ninstabilities in deep learning tools. These algorithms raise issues regarding\nsafety, reliability and interpretability in artificial intelligence; especially\nin high risk settings. From a practical perspective, there has been a war of\nescalation between those developing attack and defence strategies. At a more\ntheoretical level, researchers have also studied bigger picture questions\nconcerning the existence and computability of attacks. Here we give a brief\noverview of the topic, focusing on aspects that are likely to be of interest to\nresearchers in applied and computational mathematics.\n","authors":["Desmond J. Higham"],"pdf_url":"https://arxiv.org/pdf/2308.15092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15088v1","updated":"2023-08-29T07:51:36Z","published":"2023-08-29T07:51:36Z","title":"Using deep learning for an automatic detection and classification of the\n vascular bifurcations along the Circle of Willis","summary":" Most of the intracranial aneurysms (ICA) occur on a specific portion of the\ncerebral vascular tree named the Circle of Willis (CoW). More particularly,\nthey mainly arise onto fifteen of the major arterial bifurcations constituting\nthis circular structure. Hence, for an efficient and timely diagnosis it is\ncritical to develop some methods being able to accurately recognize each\nBifurcation of Interest (BoI). Indeed, an automatic extraction of the\nbifurcations presenting the higher risk of developing an ICA would offer the\nneuroradiologists a quick glance at the most alarming areas. Due to the recent\nefforts on Artificial Intelligence, Deep Learning turned out to be the best\nperforming technology for many pattern recognition tasks. Moreover, various\nmethods have been particularly designed for medical image analysis purposes.\nThis study intends to assist the neuroradiologists to promptly locate any\nbifurcation presenting a high risk of ICA occurrence. It can be seen as a\nComputer Aided Diagnosis scheme, where the Artificial Intelligence facilitates\nthe access to the regions of interest within the MRI. In this work, we propose\na method for a fully automatic detection and recognition of the bifurcations of\ninterest forming the Circle of Willis. Several neural networks architectures\nhave been tested, and we thoroughly evaluate the bifurcation recognition rate.\n","authors":["Rafic Nader","Romain Bourcier","Florent Autrusseau"],"pdf_url":"https://arxiv.org/pdf/2308.15088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10398v2","updated":"2023-08-29T07:33:15Z","published":"2023-04-20T15:34:20Z","title":"Multi-label Node Classification On Graph-Structured Data","summary":" Graph Neural Networks (GNNs) have shown state-of-the-art improvements in node\nclassification tasks on graphs. While these improvements have been largely\ndemonstrated in a multi-class classification scenario, a more general and\nrealistic scenario in which each node could have multiple labels has so far\nreceived little attention. 
The first challenge in conducting focused studies on\nmulti-label node classification is the limited number of publicly available\nmulti-label graph datasets. Therefore, as our first contribution, we collect\nand release three real-world biological datasets and develop a multi-label\ngraph generator to generate datasets with tunable properties. While high label\nsimilarity (high homophily) is usually attributed to the success of GNNs, we\nargue that a multi-label scenario does not follow the usual semantics of\nhomophily and heterophily so far defined for a multi-class scenario. As our\nsecond contribution, besides defining homophily for the multi-label scenario,\nwe develop a new approach that dynamically fuses the feature and label\ncorrelation information to learn label-informed representations. Finally, we\nperform a large-scale comparative study with $10$ methods and $9$ datasets\nwhich also showcase the effectiveness of our approach. We release our benchmark\nat \\url{https://anonymous.4open.science/r/LFLF-5D8C/}.\n","authors":["Tianqi Zhao","Ngan Thi Dong","Alan Hanjalic","Megha Khosla"],"pdf_url":"https://arxiv.org/pdf/2304.10398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15074v1","updated":"2023-08-29T07:15:57Z","published":"2023-08-29T07:15:57Z","title":"Exploring Model Transferability through the Lens of Potential Energy","summary":" Transfer learning has become crucial in computer vision tasks due to the vast\navailability of pre-trained deep learning models. However, selecting the\noptimal pre-trained model from a diverse pool for a specific downstream task\nremains a challenge. Existing methods for measuring the transferability of\npre-trained models rely on statistical correlations between encoded static\nfeatures and task labels, but they overlook the impact of underlying\nrepresentation dynamics during fine-tuning, leading to unreliable results,\nespecially for self-supervised models. In this paper, we present an insightful\nphysics-inspired approach named PED to address these challenges. We reframe the\nchallenge of model selection through the lens of potential energy and directly\nmodel the interaction forces that influence fine-tuning dynamics. By capturing\nthe motion of dynamic representations to decline the potential energy within a\nforce-driven physical model, we can acquire an enhanced and more stable\nobservation for estimating transferability. The experimental results on 10\ndownstream tasks and 12 self-supervised models demonstrate that our approach\ncan seamlessly integrate into existing ranking techniques and enhance their\nperformances, revealing its effectiveness for the model selection task and its\npotential for understanding the mechanism in transfer learning. Code will be\navailable at https://github.com/lixiaotong97/PED.\n","authors":["Xiaotong Li","Zixuan Hu","Yixiao Ge","Ying Shan","Ling-Yu Duan"],"pdf_url":"https://arxiv.org/pdf/2308.15074v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15072v1","updated":"2023-08-29T07:13:31Z","published":"2023-08-29T07:13:31Z","title":"Advancing Adversarial Robustness Through Adversarial Logit Update","summary":" Deep Neural Networks are susceptible to adversarial perturbations.\nAdversarial training and adversarial purification are among the most widely\nrecognized defense strategies. Although these methods have different underlying\nlogic, both rely on absolute logit values to generate label predictions. 
In\nthis study, we analyze the logit difference around successful\nadversarial attacks from a theoretical point of view and propose a new\nprinciple, namely Adversarial Logit Update (ALU), to infer adversarial samples'\nlabels. Based on ALU, we introduce a new classification paradigm that utilizes\npre- and post-purification logit differences to boost the model's adversarial\nrobustness. Without requiring adversarial or additional data for model training, our\nclean data synthesis model can be easily applied to various pre-trained models\nfor both adversarial sample detection and ALU-based data classification.\nExtensive experiments on the CIFAR-10, CIFAR-100, and tiny-ImageNet datasets\nshow that even with simple components, the proposed solution achieves superior\nrobustness performance compared to state-of-the-art methods against a wide\nrange of adversarial attacks. Our Python implementation is submitted in our\nsupplementary document and will be published upon the paper's acceptance.\n","authors":["Hao Xuan","Peican Zhu","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2308.15072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06815v2","updated":"2023-08-29T07:09:16Z","published":"2023-04-13T20:49:35Z","title":"Improving Few-Shot Prompts with Relevant Static Analysis Products","summary":" Large Language Models (LLM) are a new class of computation engines,\n\"programmed\" via prompt engineering. We are still learning how to best\n\"program\" these LLMs to help developers. We start with the intuition that\ndevelopers tend to consciously and unconsciously have a collection of semantic\nfacts in mind when working on coding tasks. Mostly these are shallow, simple\nfacts arising from a quick read. For a function, examples of facts might\ninclude parameter and local variable names, return expressions, simple pre- and\npost-conditions, and basic control and data flow, etc.\n One might assume that the powerful multi-layer architecture of\ntransformer-style LLMs makes them inherently capable of doing this simple level\nof \"code analysis\" and extracting such information, implicitly, while\nprocessing code: but are they, really? If they aren't, could explicitly adding\nthis information help? Our goal here is to investigate this question, using the\ncode summarization task and evaluate whether automatically augmenting an LLM's\nprompt with semantic facts explicitly, actually helps.\n Prior work shows that LLM performance on code summarization benefits from\nfew-shot samples drawn either from the same project or from examples found via\ninformation retrieval methods (such as BM25). While summarization performance\nhas steadily increased since the early days, there is still room for\nimprovement: LLM performance on code summarization still lags its performance\non natural-language tasks like translation and text summarization.\n We find that adding semantic facts actually does help! This approach improves\nperformance in several different settings suggested by prior work, including\nfor two different Large Language Models. In most cases, improvement nears or\nexceeds 2 BLEU; for the PHP language in the challenging CodeSearchNet dataset,\nthis augmentation actually yields performance surpassing 30 BLEU.\n","authors":["Toufique Ahmed","Kunal Suresh Pai","Premkumar Devanbu","Earl T.
Barr"],"pdf_url":"https://arxiv.org/pdf/2304.06815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15069v1","updated":"2023-08-29T07:04:50Z","published":"2023-08-29T07:04:50Z","title":"MadSGM: Multivariate Anomaly Detection with Score-based Generative\n Models","summary":" The time-series anomaly detection is one of the most fundamental tasks for\ntime-series. Unlike the time-series forecasting and classification, the\ntime-series anomaly detection typically requires unsupervised (or\nself-supervised) training since collecting and labeling anomalous observations\nare difficult. In addition, most existing methods resort to limited forms of\nanomaly measurements and therefore, it is not clear whether they are optimal in\nall circumstances. To this end, we present a multivariate time-series anomaly\ndetector based on score-based generative models, called MadSGM, which considers\nthe broadest ever set of anomaly measurement factors: i) reconstruction-based,\nii) density-based, and iii) gradient-based anomaly measurements. We also design\na conditional score network and its denoising score matching loss for the\ntime-series anomaly detection. Experiments on five real-world benchmark\ndatasets illustrate that MadSGM achieves the most robust and accurate\npredictions.\n","authors":["Haksoo Lim","Sewon Park","Minjung Kim","Jaehoon Lee","Seonkyu Lim","Noseong Park"],"pdf_url":"https://arxiv.org/pdf/2308.15069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15059v1","updated":"2023-08-29T06:43:29Z","published":"2023-08-29T06:43:29Z","title":"OEBench: Investigating Open Environment Challenges in Real-World\n Relational Data Streams","summary":" Relational datasets are widespread in real-world scenarios and are usually\ndelivered in a streaming fashion. This type of data stream can present unique\nchallenges, such as distribution drifts, outliers, emerging classes, and\nchanging features, which have recently been described as open environment\nchallenges for machine learning. While some work has been done on incremental\nlearning for data streams, their evaluations are mostly conducted with manually\npartitioned datasets. Moreover, while several real-world streaming datasets are\navailable, it is uncertain whether these open environment challenges are\nprevalent and how existing incremental learning algorithms perform on real\ndatasets. To fill this gap, we develop an Open Environment Benchmark named\nOEBench to evaluate open environment challenges in relational data streams.\nSpecifically, we investigate 55 real-world streaming datasets and establish\nthat open environment scenarios are indeed widespread in real-world datasets,\nwhich presents significant challenges for stream learning algorithms. Through\nbenchmarks, we find that increased data quantity may not consistently enhance\nthe model accuracy when applied in open environment scenarios, where machine\nlearning models can be significantly compromised by distribution shifts,\nanomalies, or untrustworthy data within real-world data streams. The current\ntechniques are insufficient in effectively mitigating these challenges posed by\nopen environments. 
Thus, it is promising to conduct more research to address\nthe new real-world challenges of open environment scenarios.\n","authors":["Yiqun Diao","Yutong Yang","Qinbin Li","Bingsheng He","Mian Lu"],"pdf_url":"https://arxiv.org/pdf/2308.15059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.06714v4","updated":"2023-08-29T06:41:58Z","published":"2022-01-18T03:13:19Z","title":"AdaTerm: Adaptive T-Distribution Estimated Robust Moments for\n Noise-Robust Stochastic Gradient Optimization","summary":" With the increasing practicality of deep learning applications, practitioners\nare inevitably faced with datasets corrupted by noise from various sources such\nas measurement errors, mislabeling, and estimated surrogate inputs/outputs that\ncan adversely impact the optimization results. It is a common practice to\nimprove the optimization algorithm's robustness to noise, since this algorithm\nis ultimately in charge of updating the network parameters. Previous studies\nrevealed that the first-order moment used in Adam-like stochastic gradient\ndescent optimizers can be modified based on the Student's t-distribution. While\nthis modification led to noise-resistant updates, the other associated\nstatistics remained unchanged, resulting in inconsistencies in the assumed\nmodels. In this paper, we propose AdaTerm, a novel approach that incorporates\nthe Student's t-distribution to derive not only the first-order moment but also\nall the associated statistics. This provides a unified treatment of the\noptimization process, offering a comprehensive framework under the statistical\nmodel of the t-distribution for the first time. The proposed approach offers\nseveral advantages over previously proposed approaches, including reduced\nhyperparameters and improved robustness and adaptability. This noise-adaptive\nbehavior contributes to AdaTerm's exceptional learning performance, as\ndemonstrated through various optimization problems with different and/or\nunknown noise ratios. Furthermore, we introduce a new technique for deriving a\ntheoretical regret bound without relying on AMSGrad, providing a valuable\ncontribution to the field.\n","authors":["Wendyam Eric Lionel Ilboudo","Taisuke Kobayashi","Takamitsu Matsubara"],"pdf_url":"https://arxiv.org/pdf/2201.06714v4.pdf","comment":"27 pages; Final version accepted by Elsevier Neurocomputing Journal\n (2023-08; https://doi.org/10.1016/j.neucom.2023.126692)"},{"id":"http://arxiv.org/abs/2011.01710v3","updated":"2023-08-29T06:39:04Z","published":"2020-11-03T13:54:01Z","title":"Ballistocardiogram artifact removal in simultaneous EEG-fMRI using\n generative adversarial network","summary":" Due to its advantages of high temporal and spatial resolution, the technology\nof simultaneous electroencephalogram-functional magnetic resonance imaging\n(EEG-fMRI) acquisition and analysis has attracted much attention, and has been\nwidely used in various research fields of brain science. However, during the\nfMRI of the brain, ballistocardiogram (BCG) artifacts can seriously contaminate\nthe EEG. As an unpaired problem, BCG artifact removal now remains a\nconsiderable challenge. Aiming to provide a solution, this paper proposes a\nnovel modular generative adversarial network (GAN) and corresponding training\nstrategy to improve the network performance by optimizing the parameters of\neach module.
In this manner, we hope to improve the local representation\nability of the network model, thereby improving its overall performance and\nobtaining a reliable generator for BCG artifact removal. Moreover, the proposed\nmethod does not rely on an additional reference signal or complex hardware\nequipment. Experimental results show that, compared with multiple methods, the\ntechnique presented in this paper can remove the BCG artifact more effectively\nwhile retaining essential EEG information.\n","authors":["Guang Lin","Jianhai Zhang","Yuxi Liu","Tianyang Gao","Wanzeng Kong","Xu Lei","Tao Qiu"],"pdf_url":"https://arxiv.org/pdf/2011.01710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15055v1","updated":"2023-08-29T06:31:21Z","published":"2023-08-29T06:31:21Z","title":"Taxonomic Loss for Morphological Glossing of Low-Resource Languages","summary":" Morpheme glossing is a critical task in automated language documentation and\ncan benefit other downstream applications greatly. While state-of-the-art\nglossing systems perform very well for languages with large amounts of existing\ndata, it is more difficult to create useful models for low-resource languages.\nIn this paper, we propose the use of a taxonomic loss function that exploits\nmorphological information to make morphological glossing more performant when\ndata is scarce. We find that while this loss function does not\noutperform a standard loss function with regard to single-label prediction\naccuracy, it produces better predictions when considering the top-n predicted\nlabels. We suggest this property makes the taxonomic loss function useful in a\nhuman-in-the-loop annotation setting.\n","authors":["Michael Ginn","Alexis Palmer"],"pdf_url":"https://arxiv.org/pdf/2308.15055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06564v2","updated":"2023-08-29T06:25:48Z","published":"2023-08-12T13:17:09Z","title":"EquiDiff: A Conditional Equivariant Diffusion Model For Trajectory\n Prediction","summary":" Accurate trajectory prediction is crucial for the safe and efficient\noperation of autonomous vehicles. The growing popularity of deep learning has\nled to the development of numerous methods for trajectory prediction. While\ndeterministic deep learning models have been widely used, deep generative\nmodels have gained popularity as they learn data distributions from training\ndata and account for trajectory uncertainties. In this study, we propose\nEquiDiff, a deep generative model for predicting future vehicle trajectories.\nEquiDiff is based on the conditional diffusion model, which generates future\ntrajectories by incorporating historical information and random Gaussian noise.\nThe backbone model of EquiDiff is an SO(2)-equivariant transformer that fully\nutilizes the geometric properties of location coordinates. In addition, we\nemploy Recurrent Neural Networks and Graph Attention Networks to extract social\ninteractions from historical trajectories. To evaluate the performance of\nEquiDiff, we conduct extensive experiments on the NGSIM dataset. Our results\ndemonstrate that EquiDiff outperforms other baseline models in short-term\nprediction, but has slightly higher errors for long-term prediction.\nFurthermore, we conduct an ablation study to investigate the contribution of\neach component of EquiDiff to the prediction accuracy.
Additionally, we present\na visualization of the generation process of our diffusion model, providing\ninsights into the uncertainty of the prediction.\n","authors":["Kehua Chen","Xianda Chen","Zihan Yu","Meixin Zhu","Hai Yang"],"pdf_url":"https://arxiv.org/pdf/2308.06564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15050v1","updated":"2023-08-29T06:20:36Z","published":"2023-08-29T06:20:36Z","title":"iBARLE: imBalance-Aware Room Layout Estimation","summary":" Room layout estimation predicts layouts from a single panorama. It requires\ndatasets with large-scale and diverse room shapes to train the models. However,\nthere are significant imbalances in real-world datasets including the\ndimensions of layout complexity, camera locations, and variation in scene\nappearance. These issues considerably influence the model training performance.\nIn this work, we propose the imBalance-Aware Room Layout Estimation (iBARLE)\nframework to address these issues. iBARLE consists of (1) Appearance Variation\nGeneration (AVG) module, which promotes visual appearance domain\ngeneralization, (2) Complex Structure Mix-up (CSMix) module, which enhances\ngeneralizability w.r.t. room structure, and (3) a gradient-based layout\nobjective function, which allows more effective accounting for occlusions in\ncomplex layouts. All modules are jointly trained and help each other to achieve\nthe best performance. Experiments and ablation studies based on\nZInD~\\cite{cruz2021zillow} dataset illustrate that iBARLE has state-of-the-art\nperformance compared with other layout estimation baselines.\n","authors":["Taotao Jing","Lichen Wang","Naji Khosravan","Zhiqiang Wan","Zachary Bessinger","Zhengming Ding","Sing Bing Kang"],"pdf_url":"https://arxiv.org/pdf/2308.15050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09121v2","updated":"2023-08-29T06:11:58Z","published":"2023-05-16T03:00:04Z","title":"A Conditional Denoising Diffusion Probabilistic Model for Radio\n Interferometric Image Reconstruction","summary":" In radio astronomy, signals from radio telescopes are transformed into images\nof observed celestial objects, or sources. However, these images, called dirty\nimages, contain real sources as well as artifacts due to signal sparsity and\nother factors. Therefore, radio interferometric image reconstruction is\nperformed on dirty images, aiming to produce clean images in which artifacts\nare reduced and real sources are recovered. So far, existing methods have\nlimited success on recovering faint sources, preserving detailed structures,\nand eliminating artifacts. In this paper, we present VIC-DDPM, a Visibility and\nImage Conditioned Denoising Diffusion Probabilistic Model. Our main idea is to\nuse both the original visibility data in the spectral domain and dirty images\nin the spatial domain to guide the image generation process with DDPM. This\nway, we can leverage DDPM to generate fine details and eliminate noise, while\nutilizing visibility data to separate signals from noise and retaining spatial\ninformation in dirty images. We have conducted experiments in comparison with\nboth traditional methods and recent deep learning based approaches. Our results\nshow that our method significantly improves the resulting images by reducing\nartifacts, preserving fine details, and recovering dim sources. 
This\nadvancement further facilitates radio astronomical data analysis tasks on\ncelestial phenomena.\n","authors":["Ruoqi Wang","Zhuoyang Chen","Qiong Luo","Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2305.09121v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.15047v1","updated":"2023-08-29T06:09:47Z","published":"2023-08-29T06:09:47Z","title":"Large language models converge toward human-like concept organization","summary":" Large language models show human-like performance in knowledge extraction,\nreasoning and dialogue, but it remains controversial whether this performance\nis best explained by memorization and pattern matching, or whether it reflects\nhuman-like inferential semantics and world knowledge. Knowledge bases such as\nWikiData provide large-scale, high-quality representations of inferential\nsemantics and world knowledge. We show that large language models learn to\norganize concepts in ways that are strikingly similar to how concepts are\norganized in such knowledge bases. Knowledge bases model collective,\ninstitutional knowledge, and large language models seem to induce such\nknowledge from raw text. We show that bigger and better models exhibit more\nhuman-like concept organization, across four families of language models and\nthree knowledge graph embeddings.\n","authors":["Mathias Lykke Gammelgaard","Jonathan Gabel Christiansen","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2308.15047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10875v2","updated":"2023-08-29T05:42:49Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" With the growth of 3D sensing technology, deep learning system for 3D point\nclouds has become increasingly important, especially in applications like\nautonomous vehicles where safety is a primary concern. However, there are also\ngrowing concerns about the reliability of these systems when they encounter\nnoisy point clouds, whether occurring naturally or introduced with malicious\nintent. This paper highlights the challenges of point cloud classification\nposed by various forms of noise, from simple background noise to malicious\nbackdoor attacks that can intentionally skew model predictions. While there's\nan urgent need for optimized point cloud denoising, current point outlier\nremoval approaches, an essential step for denoising, rely heavily on\nhandcrafted strategies and are not adapted for higher-level tasks, such as\nclassification. To address this issue, we introduce an innovative point outlier\ncleansing method that harnesses the power of downstream classification models.\nBy employing gradient-based attribution analysis, we define a novel concept:\npoint risk. 
Drawing inspiration from tail risk minimization in finance, we\nrecast the outlier removal process as an optimization problem, named PointCVaR.\nExtensive experiments show that our proposed technique not only robustly\nfilters diverse point cloud outliers but also consistently and significantly\nenhances existing robust methods for point cloud classification.\n","authors":["Xinke Li","Junchi Lu","Henghui Ding","Changsheng Sun","Joey Tianyi Zhou","Chee Yeow Meng"],"pdf_url":"https://arxiv.org/pdf/2307.10875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12095v5","updated":"2023-08-29T05:34:25Z","published":"2023-02-22T11:01:20Z","title":"On the Robustness of ChatGPT: An Adversarial and Out-of-distribution\n Perspective","summary":" ChatGPT is a recent chatbot service released by OpenAI and is receiving\nincreasing attention over the past few months. While evaluations of various\naspects of ChatGPT have been done, its robustness, i.e., the performance to\nunexpected inputs, is still unclear to the public. Robustness is of particular\nconcern in responsible AI, especially for safety-critical applications. In this\npaper, we conduct a thorough evaluation of the robustness of ChatGPT from the\nadversarial and out-of-distribution (OOD) perspective. To do so, we employ the\nAdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart\nreview and DDXPlus medical diagnosis datasets for OOD evaluation. We select\nseveral popular foundation models as baselines. Results show that ChatGPT shows\nconsistent advantages on most adversarial and OOD classification and\ntranslation tasks. However, the absolute performance is far from perfection,\nwhich suggests that adversarial and OOD robustness remains a significant threat\nto foundation models. Moreover, ChatGPT shows astounding performance in\nunderstanding dialogue-related texts and we find that it tends to provide\ninformal suggestions for medical tasks instead of definitive answers. Finally,\nwe present in-depth discussions of possible research directions.\n","authors":["Jindong Wang","Xixu Hu","Wenxin Hou","Hao Chen","Runkai Zheng","Yidong Wang","Linyi Yang","Haojun Huang","Wei Ye","Xiubo Geng","Binxin Jiao","Yue Zhang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2302.12095v5.pdf","comment":"Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable\n Large-Scale Machine Learning Models; code is at:\n https://github.com/microsoft/robustlearn; more works:\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2308.15020v1","updated":"2023-08-29T04:50:07Z","published":"2023-08-29T04:50:07Z","title":"Massively Parallel Continuous Local Search for Hybrid SAT Solving on\n GPUs","summary":" Although state-of-the-art (SOTA) SAT solvers based on conflict-driven clause\nlearning (CDCL) have achieved remarkable engineering success, their sequential\nnature limits the parallelism that may be extracted for acceleration on\nplatforms such as the graphics processing unit (GPU). In this work, we propose\nFastFourierSAT, a highly parallel hybrid SAT solver based on gradient-driven\ncontinuous local search (CLS). This is realized by a novel parallel algorithm\ninspired by the Fast Fourier Transform (FFT)-based convolution for computing\nthe elementary symmetric polynomials (ESPs), which is the major computational\ntask in previous CLS methods. The complexity of our algorithm matches the best\nprevious result. 
Furthermore, the substantial parallelism inherent in our\nalgorithm can leverage the GPU for acceleration, demonstrating significant\nimprovement over the previous CLS approaches. We also propose to incorporate\nthe restart heuristics in CLS to improve search efficiency. We compare our\napproach with the SOTA parallel SAT solvers on several benchmarks. Our results\nshow that FastFourierSAT computes the gradient 100+ times faster than previous\nprototypes implemented on CPU. Moreover, FastFourierSAT solves most instances\nand demonstrates promising performance on larger-size instances.\n","authors":["Yunuo Cen","Zhiwei Zhang","Xuanyao Fong"],"pdf_url":"https://arxiv.org/pdf/2308.15020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11872v3","updated":"2023-08-29T04:00:03Z","published":"2022-06-22T17:47:41Z","title":"Provable Acceleration of Heavy Ball beyond Quadratics for a Class of\n Polyak-Łojasiewicz Functions when the Non-Convexity is Averaged-Out","summary":" Heavy Ball (HB) nowadays is one of the most popular momentum methods in\nnon-convex optimization. It has been widely observed that incorporating the\nHeavy Ball dynamic in gradient-based methods accelerates the training process\nof modern machine learning models. However, the progress on establishing its\ntheoretical foundation of acceleration is apparently far behind its empirical\nsuccess. Existing provable acceleration results are for quadratic or\nclose-to-quadratic functions, as the current techniques of showing HB's\nacceleration are limited to the case when the Hessian is fixed. In this work,\nwe develop some new techniques that help show acceleration beyond quadratics,\nwhich is achieved by analyzing how the change of the Hessian at two consecutive\ntime points affects the convergence speed. Based on our technical results, a\nclass of Polyak-\\L{}ojasiewicz (PL) optimization problems for which provable\nacceleration can be achieved via HB is identified. Moreover, our analysis\ndemonstrates a benefit of adaptively setting the momentum parameter.\n (Update: 08/29/2023) Erratum is added in Appendix J. This is an updated\nversion that fixes an issue in the previous version. An additional condition\nneeds to be satisfied for the acceleration result of HB beyond quadratics in\nthis work, which naturally holds when the dimension is one or, more broadly,\nwhen the Hessian is diagonal. We elaborate on the issue in Appendix J.\n","authors":["Jun-Kun Wang","Chi-Heng Lin","Andre Wibisono","Bin Hu"],"pdf_url":"https://arxiv.org/pdf/2206.11872v3.pdf","comment":"(ICML 2022) Proceedings of the 39th International Conference on\n Machine Learning;"},{"id":"http://arxiv.org/abs/2308.15006v1","updated":"2023-08-29T03:54:53Z","published":"2023-08-29T03:54:53Z","title":"Exploiting Problem Geometry in Safe Linear Bandits","summary":" The safe linear bandit problem is a version of the classic linear bandit\nproblem where the learner's actions must satisfy an uncertain linear constraint\nat all rounds. Due to its applicability to many real-world settings, this problem\nhas received considerable attention in recent years. We find that by exploiting\nthe geometry of the specific problem setting, we can achieve improved regret\nguarantees for both well-separated problem instances and action sets that are\nfinite star convex sets. Additionally, we propose a novel algorithm for this\nsetting that chooses problem parameters adaptively and enjoys at least as good\nregret guarantees as existing algorithms.
Lastly, we introduce a generalization\nof the safe linear bandit setting where the constraints are convex and adapt\nour algorithms and analyses to this setting by leveraging a novel\nconvex-analysis based approach. Simulation results show improved performance\nover existing algorithms for a variety of randomly sampled settings.\n","authors":["Spencer Hutchinson","Berkay Turan","Mahnoosh Alizadeh"],"pdf_url":"https://arxiv.org/pdf/2308.15006v1.pdf","comment":"38 pages, 4 figures"},{"id":"http://arxiv.org/abs/1909.04883v4","updated":"2023-08-29T03:36:39Z","published":"2019-09-11T07:30:53Z","title":"Semi-supervised Vector-valued Learning: Improved Bounds and Algorithms","summary":" Vector-valued learning, where the output space admits a vector-valued\nstructure, is an important problem that covers a broad family of important\ndomains, e.g. multi-task learning and transfer learning. Using local Rademacher\ncomplexity and unlabeled data, we derive novel semi-supervised excess risk\nbounds for general vector-valued learning from both kernel perspective and\nlinear perspective. The derived bounds are much sharper than existing ones and\nthe convergence rates are improved from the square root of labeled sample size\nto the square root of total sample size or directly dependent on labeled sample\nsize. Motivated by our theoretical analysis, we propose a general\nsemi-supervised algorithm for efficiently learning vector-valued functions,\nincorporating both local Rademacher complexity and Laplacian regularization.\nExtensive experimental results illustrate the proposed algorithm significantly\noutperforms the compared methods, which coincides with our theoretical\nfindings.\n","authors":["Jian Li","Yong Liu","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/1909.04883v4.pdf","comment":"Accepted at Pattern Recognition"},{"id":"http://arxiv.org/abs/2307.00290v2","updated":"2023-08-29T03:31:58Z","published":"2023-07-01T10:12:46Z","title":"All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with\n Prompt-based Finetuning","summary":" The Segment Anything Model (SAM) is a recently proposed prompt-based\nsegmentation model in a generic zero-shot segmentation approach. With the\nzero-shot segmentation capacity, SAM achieved impressive flexibility and\nprecision on various segmentation tasks. However, the current pipeline requires\nmanual prompts during the inference stage, which is still resource intensive\nfor biomedical image segmentation. In this paper, instead of using prompts\nduring the inference stage, we introduce a pipeline that utilizes the SAM,\ncalled all-in-SAM, through the entire AI development workflow (from annotation\ngeneration to model finetuning) without requiring manual prompts during the\ninference stage. Specifically, SAM is first employed to generate pixel-level\nannotations from weak prompts (e.g., points, bounding box). Then, the\npixel-level annotations are used to finetune the SAM segmentation model rather\nthan training from scratch. Our experimental results reveal two key findings:\n1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a\nnuclei segmentation task on the public Monuseg dataset, and 2) the utilization\nof weak and few annotations for SAM finetuning achieves competitive performance\ncompared to using strong pixel-wise annotated data.\n","authors":["Can Cui","Ruining Deng","Quan Liu","Tianyuan Yao","Shunxing Bao","Lucas W. 
Remedios","Yucheng Tang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2307.00290v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15860v2","updated":"2023-08-29T03:13:32Z","published":"2023-03-28T10:05:06Z","title":"The Wyner Variational Autoencoder for Unsupervised Multi-Layer Wireless\n Fingerprinting","summary":" Wireless fingerprinting refers to a device identification method leveraging\nhardware imperfections and wireless channel variations as signatures. Beyond\nphysical layer characteristics, recent studies demonstrated that user behaviors\ncould be identified through network traffic, e.g., packet length, without\ndecryption of the payload. Inspired by these results, we propose a multi-layer\nfingerprinting framework that jointly considers the multi-layer signatures for\nimproved identification performance. In contrast to previous works, by\nleveraging the recent multi-view machine learning paradigm, i.e., data with\nmultiple forms, our method can cluster the device information shared among the\nmulti-layer features without supervision. Our information-theoretic approach\ncan be extended to supervised and semi-supervised settings with straightforward\nderivations. In solving the formulated problem, we obtain a tight surrogate\nbound using variational inference for efficient optimization. In extracting the\nshared device information, we develop an algorithm based on the Wyner common\ninformation method, enjoying reduced computation complexity as compared to\nexisting approaches. The algorithm can be applied to data distributions\nbelonging to the exponential family class. Empirically, we evaluate the\nalgorithm in a synthetic dataset with real-world video traffic and simulated\nphysical layer characteristics. Our empirical results show that the proposed\nmethod outperforms the state-of-the-art baselines in both supervised and\nunsupervised settings.\n","authors":["Teng-Hui Huang","Thilini Dahanayaka","Kanchana Thilakarathna","Philip H. W. Leong","Hesham El Gamal"],"pdf_url":"https://arxiv.org/pdf/2303.15860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14995v1","updated":"2023-08-29T02:50:36Z","published":"2023-08-29T02:50:36Z","title":"WSAM: Visual Explanations from Style Augmentation as Adversarial\n Attacker and Their Influence in Image Classification","summary":" Currently, style augmentation is capturing attention due to convolutional\nneural networks (CNN) being strongly biased toward recognizing textures rather\nthan shapes. Most existing styling methods either perform a low-fidelity style\ntransfer or a weak style representation in the embedding vector. This paper\noutlines a style augmentation algorithm using stochastic-based sampling with\nnoise addition to improving randomization on a general linear transformation\nfor style transfer. With our augmentation strategy, all models not only present\nincredible robustness against image stylizing but also outperform all previous\nmethods and surpass the state-of-the-art performance for the STL-10 dataset. In\naddition, we present an analysis of the model interpretations under different\nstyle variations. 
At the same time, we present comprehensive experiments\ndemonstrating the performance when applied to deep neural architectures in\ntraining settings.\n","authors":["Felipe Moreno-Vera","Edgar Medina","Jorge Poco"],"pdf_url":"https://arxiv.org/pdf/2308.14995v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2207.03364v4","updated":"2023-08-29T02:44:33Z","published":"2022-07-07T15:12:02Z","title":"Group Equality in Adaptive Submodular Maximization","summary":" In this paper, we study the classic submodular maximization problem subject\nto a group equality constraint under both non-adaptive and adaptive settings.\nIt has been shown that the utility function of many machine learning\napplications, including data summarization, influence maximization in social\nnetworks, and personalized recommendation, satisfies the property of\nsubmodularity. Hence, maximizing a submodular function subject to various\nconstraints can be found at the heart of many of those applications. On a high\nlevel, submodular maximization aims to select a group of the most representative\nitems (e.g., data points). However, the design of most existing algorithms does\nnot incorporate the fairness constraint, leading to under- or\nover-representation of some particular groups. This motivates us to study the\nsubmodular maximization problem with group equality, where we aim to select a\ngroup of items to maximize a (possibly non-monotone) submodular utility\nfunction subject to a group equality constraint. To this end, we develop the\nfirst constant-factor approximation algorithm for this problem. The design of\nour algorithm is robust enough to be extended to solving the submodular\nmaximization problem under a more complicated adaptive setting. Moreover, we\nfurther extend our study to incorporating a global cardinality constraint and\nother fairness notions.\n","authors":["Shaojie Tang","Jing Yuan"],"pdf_url":"https://arxiv.org/pdf/2207.03364v4.pdf","comment":"This paper has been accepted by INFORMS Journal on Computing"},{"id":"http://arxiv.org/abs/2308.14991v1","updated":"2023-08-29T02:43:58Z","published":"2023-08-29T02:43:58Z","title":"Incorporating Neuro-Inspired Adaptability for Continual Learning in\n Artificial Intelligence","summary":" Continual learning aims to empower artificial intelligence (AI) with strong\nadaptability to the real world. For this purpose, a desirable solution should\nproperly balance memory stability with learning plasticity, and acquire\nsufficient compatibility to capture the observed distributions. Existing\nadvances mainly focus on preserving memory stability to overcome catastrophic\nforgetting, but struggle to flexibly accommodate incremental changes as\nbiological intelligence (BI) does. 
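The group-equality abstract above concerns selecting items that maximize a submodular utility while keeping groups balanced. A hedged sketch of the problem setup using plain greedy selection with a per-group budget; the paper's constant-factor algorithm is more involved, and the utility, groups, and budgets here are all illustrative:

```python
# Greedy submodular selection under a simple per-group budget (illustrative).
def greedy_group_equal(items, groups, utility, per_group_budget):
    """items: list of ids; groups: dict id -> group label;
    utility: set -> float (assumed submodular); per_group_budget: int."""
    selected, counts = set(), {}
    while True:
        best_gain, best_item = 0.0, None
        for i in items:
            if i in selected or counts.get(groups[i], 0) >= per_group_budget:
                continue
            gain = utility(selected | {i}) - utility(selected)  # marginal gain
            if gain > best_gain:
                best_gain, best_item = gain, i
        if best_item is None:
            return selected
        selected.add(best_item)
        counts[groups[best_item]] = counts.get(groups[best_item], 0) + 1

# toy coverage-style utility: number of distinct "topics" covered
topics = {1: {"a"}, 2: {"a", "b"}, 3: {"c"}, 4: {"c", "d"}}
u = lambda s: float(len(set().union(*(topics[i] for i in s)))) if s else 0.0
print(greedy_group_equal([1, 2, 3, 4], {1: "x", 2: "x", 3: "y", 4: "y"}, u, 1))
```

With a budget of one item per group, the greedy pass picks {2, 4}: the most covering item from each group, which is the balance the constraint enforces.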
By modeling a robust Drosophila learning\nsystem that actively regulates forgetting with multiple learning modules, here\nwe propose a generic approach that appropriately attenuates old memories in\nparameter distributions to improve learning plasticity, and accordingly\ncoordinates a multi-learner architecture to ensure solution compatibility.\nThrough extensive theoretical and empirical validation, our approach not only\nclearly enhances the performance of continual learning, especially over\nsynaptic regularization methods in task-incremental settings, but also\npotentially advances the understanding of neurological adaptive mechanisms,\nserving as a novel paradigm to progress AI and BI together.\n","authors":["Liyuan Wang","Xingxing Zhang","Qian Li","Mingtian Zhang","Hang Su","Jun Zhu","Yi Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.14991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14983v1","updated":"2023-08-29T02:23:58Z","published":"2023-08-29T02:23:58Z","title":"Constructive Incremental Learning for Fault Diagnosis of Rolling\n Bearings with Ensemble Domain Adaptation","summary":" Given the prevalence of rolling bearing fault diagnosis as a practical issue\nacross various working conditions, the limited availability of samples\ncompounds the challenge. Additionally, the complexity of the external\nenvironment and the structure of rolling bearings often results in faults\ncharacterized by randomness and fuzziness, hindering the effective extraction\nof fault characteristics and restricting the accuracy of fault diagnosis. To\novercome these problems, this paper presents a novel approach termed\nconstructive incremental learning-based ensemble domain adaptation (CIL-EDA).\nSpecifically, it is implemented on stochastic configuration networks\n(SCN) to constructively improve its adaptive performance across multiple domains.\nConcretely, a cloud feature extraction method is employed in conjunction with\nwavelet packet decomposition (WPD) to capture the uncertainty of fault\ninformation from multiple resolution aspects. Subsequently, constructive\nincremental learning-based domain adaptation (CIL-DA) is first developed to\nenhance the cross-domain learning capability of each hidden node through domain\nmatching and construct a robust fault classifier by leveraging limited labeled\ndata from both target and source domains. Finally, fault diagnosis results are\nobtained by majority voting in CIL-EDA, which integrates CIL-DA and parallel\nensemble learning. Experimental results demonstrate that our CIL-DA outperforms\nseveral domain adaptation methods and CIL-EDA consistently outperforms\nstate-of-the-art fault diagnosis methods in few-shot scenarios.\n","authors":["Jiang Liu","Wei Dai"],"pdf_url":"https://arxiv.org/pdf/2308.14983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14981v1","updated":"2023-08-29T02:16:48Z","published":"2023-08-29T02:16:48Z","title":"Sub-universal variational circuits for combinatorial optimization\n problems","summary":" Quantum variational circuits have gained significant attention due to their\napplications in the quantum approximate optimization algorithm and quantum\nmachine learning research. This work introduces a novel class of classical\nprobabilistic circuits designed for generating approximate solutions to\ncombinatorial optimization problems constructed using two-bit stochastic\nmatrices. 
Through a numerical study, we investigate the performance of our\nproposed variational circuits in solving the Max-Cut problem on various graphs\nof increasing sizes. Our classical algorithm demonstrates improved performance\nfor several graph types compared to the quantum approximate optimization algorithm. Our\nfindings suggest that evaluating the performance of quantum variational\ncircuits against variational circuits with sub-universal gate sets is a\nvaluable benchmark for identifying areas where quantum variational circuits can\nexcel.\n","authors":["Gal Weitz","Lirandë Pira","Chris Ferrie","Joshua Combes"],"pdf_url":"https://arxiv.org/pdf/2308.14981v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.14976v1","updated":"2023-08-29T02:05:40Z","published":"2023-08-29T02:05:40Z","title":"Efficient labeling of solar flux evolution videos by a deep learning\n model","summary":" Machine learning (ML) is becoming a critical tool for the interrogation of large,\ncomplex data. Labeling, defined as the process of adding meaningful\nannotations, is a crucial step of supervised ML. However, labeling datasets is\ntime consuming. Here we show that convolutional neural networks (CNNs), trained\non crudely labeled astronomical videos, can be leveraged to improve the quality\nof data labeling and reduce the need for human intervention. We use videos of\nthe solar magnetic field, crudely labeled into two classes: emergence or\nnon-emergence of bipolar magnetic regions (BMRs), based on their first\ndetection on the solar disk. We train CNNs using crude labels, manually verify\nand correct the labels where they disagree with CNN predictions, and repeat this process until\nconvergence. Traditionally, flux emergence labeling is done manually. We find\nthat a high-quality labeled dataset, derived through this iterative process,\nreduces the necessary manual verification by 50%. Furthermore, by gradually\nmasking the videos and looking for maximum change in CNN inference, we locate\nBMR emergence time without retraining the CNN. This demonstrates the\nversatility of CNNs for simplifying the challenging task of labeling complex\ndynamic events.\n","authors":["Subhamoy Chatterjee","Andrés Muñoz-Jaramillo","Derek A. Lamb"],"pdf_url":"https://arxiv.org/pdf/2308.14976v1.pdf","comment":"16 pages, 7 figures, published in Nature Astronomy, June 27, 2022"},{"id":"http://arxiv.org/abs/2308.14328v2","updated":"2023-08-29T01:58:02Z","published":"2023-08-28T06:15:14Z","title":"Reinforcement Learning for Generative AI: A Survey","summary":" Deep Generative AI has been a long-standing essential topic in the machine\nlearning community, which can impact a number of application areas like text\ngeneration and computer vision. The major paradigm to train a generative model\nis maximum likelihood estimation, which pushes the learner to capture and\napproximate the target data distribution by decreasing the divergence between\nthe model distribution and the target distribution. This formulation\nsuccessfully establishes the objective of generative tasks, but it is\nincapable of satisfying all the requirements that a user might expect from a\ngenerative model. 
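The solar-flux abstract above describes an iterate, verify, and correct labeling loop. A minimal sketch of that loop, with a logistic regression standing in for the CNN and synthetic arrays standing in for labeled videos (all names and numbers are hypothetical):

```python
# Sketch of the iterative crude-label refinement loop (not the authors' code).
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 16))
true_y = (X[:, 0] > 0).astype(int)       # ground truth (the human oracle)
labels = true_y.copy()
flip = rng.random(200) < 0.15            # crude labels start 15% wrong
labels[flip] = 1 - labels[flip]

for round_ in range(5):
    clf = LogisticRegression(max_iter=1000).fit(X, labels)
    disagree = np.flatnonzero(clf.predict(X) != labels)
    if disagree.size == 0:
        break                            # converged: model agrees with labels
    # "manual verification": only the disagreements are checked by the oracle,
    # which is what cuts the human workload in the abstract's pipeline
    labels[disagree] = true_y[disagree]

print("rounds:", round_ + 1, "label accuracy:", (labels == true_y).mean())
```

The key point the loop illustrates is that human effort is spent only on model-label disagreements, not on the whole dataset.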
Reinforcement learning, serving as a competitive option to\ninject new training signals by creating new objectives that exploit novel\nsignals, has demonstrated its power and flexibility to incorporate human\ninductive bias from multiple angles, such as adversarial learning,\nhand-designed rules, and learned reward models, to build performant models.\nAs a result, reinforcement learning has become a trending research field and has\nstretched the limits of generative AI in both model design and application. It\nis therefore timely to summarize recent advances with a\ncomprehensive review. Although surveys of individual application areas\nhave appeared recently, this survey aims to offer a high-level review that spans a\nrange of application areas. We provide a rigorous taxonomy in this area and\nprovide broad coverage of various models and applications. Notably, we also\nsurveyed the fast-developing large language model area. We conclude this survey\nby showing the potential directions that might tackle the limits of current\nmodels and expand the frontiers for generative AI.\n","authors":["Yuanjiang Cao","Quan Z. Sheng","Julian McAuley","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2308.14328v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14971v1","updated":"2023-08-29T01:53:14Z","published":"2023-08-29T01:53:14Z","title":"Distributed multi-agent target search and tracking with Gaussian process\n and reinforcement learning","summary":" Deploying multiple robots for target search and tracking has many practical\napplications, yet the challenge of planning over unknown or partially known\ntargets remains difficult to address. With recent advances in deep learning,\nintelligent control techniques such as reinforcement learning have enabled\nagents to learn autonomously from environment interactions with little to no\nprior knowledge. Such methods can address the exploration-exploitation tradeoff\nof planning over unknown targets in a data-driven manner, eliminating the\nreliance on heuristics typical of traditional approaches and streamlining the\ndecision-making pipeline with end-to-end training. In this paper, we propose a\nmulti-agent reinforcement learning technique with target map building based on\na distributed Gaussian process. We leverage the distributed Gaussian process to\nencode belief over the target locations and efficiently plan over unknown\ntargets. We evaluate the performance and transferability of the trained policy\nin simulation and demonstrate the method on a swarm of micro unmanned aerial\nvehicles with hardware experiments.\n","authors":["Jigang Kim","Dohyun Jang","H. Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14971v1.pdf","comment":"10 pages, 6 figures; preprint submitted to IJCAS; first two authors\n contributed equally"},{"id":"http://arxiv.org/abs/2308.14969v1","updated":"2023-08-29T01:47:49Z","published":"2023-08-29T01:47:49Z","title":"Reprogramming under constraints: Revisiting efficient and reliable\n transferability of lottery tickets","summary":" In the era of foundation models with huge pre-training budgets, the\nfocus for downstream tasks has shifted to efficient and fast\nadaptation. 
For classification-based tasks in the domain of computer vision,\nthe two most efficient approaches have been linear probing (LP) and visual\nprompting/reprogramming (VP); the former aims to learn a classifier in the form\nof a linear head on the features extracted by the pre-trained model, while the\nlatter maps the input data to the domain of the source data on which the model\nwas originally pre-trained. Although extensive studies have demonstrated the\ndifferences between LP and VP in terms of downstream performance, we explore\nthe capabilities of the two aforementioned methods via the sparsity axis: (a)\nData sparsity: the impact of few-shot adaptation and (b) Model sparsity: the\nimpact of lottery tickets (LT). We demonstrate that LTs are not universal\nreprogrammers, i.e., for certain target datasets, reprogramming an LT yields\nsignificantly lower performance than the reprogrammed dense model, although\ntheir corresponding upstream performance is similar. Further, we demonstrate\nthat the calibration of dense models is always superior to that of their\nlottery ticket counterparts under both LP and VP regimes. Our empirical study\nopens a new avenue of research into VP for sparse models and encourages further\nunderstanding of the performance beyond the accuracy achieved by VP under\nconstraints of sparsity. Code and logs can be accessed at\n\\url{https://github.com/landskape-ai/Reprogram_LT}.\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.03312v4","updated":"2023-08-29T01:44:39Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. 
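The lottery-ticket abstract above contrasts linear probing (LP) with visual prompting (VP). A short sketch of LP under its standard definition: freeze the pre-trained encoder and fit only a linear head on its features. The "encoder" below is a random stand-in, not a real pre-trained model:

```python
# Linear probing sketch: train only a linear classifier on frozen features.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)

def frozen_encoder(images):
    # placeholder for features from a frozen pre-trained model;
    # no parameters here are ever updated during probing
    return images.reshape(len(images), -1)

images = rng.normal(size=(120, 8, 8))
labels = rng.integers(0, 3, size=120)

feats = frozen_encoder(images)
linear_head = LogisticRegression(max_iter=500).fit(feats, labels)
print("train accuracy:", linear_head.score(feats, labels))
```

VP, by contrast, would leave the classifier alone and learn a transformation of the *inputs* so they resemble the encoder's source domain.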
Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14962v1","updated":"2023-08-29T01:29:26Z","published":"2023-08-29T01:29:26Z","title":"Streaming Compression of Scientific Data via weak-SINDy","summary":" In this paper, a streaming weak-SINDy algorithm is developed specifically for\ncompressing streaming scientific data. The production of scientific data,\neither via simulation or experiments, is undergoing a stage of exponential\ngrowth, which makes data compression important and often necessary for storing\nand utilizing large scientific data sets. As opposed to classical \"offline\"\ncompression algorithms that perform compression on a readily available data\nset, streaming compression algorithms compress data \"online\" while the data\ngenerated from simulation or experiments is still flowing through the system.\nThis feature makes streaming compression algorithms well-suited for scientific\ndata compression, where storing the full data set offline is often infeasible.\nThis work proposes a new streaming compression algorithm, streaming weak-SINDy,\nwhich takes advantage of the underlying data characteristics during\ncompression. The streaming weak-SINDy algorithm constructs feature matrices and\ntarget vectors in the online stage via a streaming integration method in a\nmemory efficient manner. The feature matrices and target vectors are then used\nin the offline stage to build a model through a regression process that aims to\nrecover equations that govern the evolution of the data. For compressing\nhigh-dimensional streaming data, we adopt a streaming proper orthogonal\ndecomposition (POD) process to reduce the data dimension and then use the\nstreaming weak-SINDy algorithm to compress the temporal data of the POD\nexpansion. We propose modifications to the streaming weak-SINDy algorithm to\naccommodate the dynamically updated POD basis. By combining the built model\nfrom the streaming weak-SINDy algorithm and a small amount of data samples, the\nfull data flow can be reconstructed accurately at a low memory cost, as shown\nin the numerical tests.\n","authors":["Benjamin P. Russo","M. Paul Laiu","Richard Archibald"],"pdf_url":"https://arxiv.org/pdf/2308.14962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03454v2","updated":"2023-08-29T01:20:04Z","published":"2023-06-06T07:17:56Z","title":"Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems:\n Challenges and Opportunities","summary":" Multi-Sensor Fusion (MSF) based perception systems have been the foundation\nin supporting many industrial applications and domains, such as self-driving\ncars, robotic arms, and unmanned aerial vehicles. Over the past few years, the\nfast progress in data-driven artificial intelligence (AI) has brought a\nfast-increasing trend to empower MSF systems with deep learning techniques to\nfurther improve performance, especially on intelligent systems and their\nperception systems. Although quite a few AI-enabled MSF perception systems and\ntechniques have been proposed, up to the present, limited benchmarks that focus\non MSF perception are publicly available. 
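The streaming weak-SINDy abstract above separates an online stage (building feature matrices and target vectors sample-by-sample) from an offline regression stage. A hedged sketch of that split, with a hypothetical polynomial feature library; memory stays O(d^2) regardless of stream length:

```python
# Online accumulation of normal equations, offline solve (illustrative only).
import numpy as np

def features(x):
    return np.array([1.0, x, x**2, x**3])   # hypothetical feature library

d = 4
G = np.zeros((d, d))                         # streaming Gram matrix Phi^T Phi
b = np.zeros(d)                              # streaming moments Phi^T y
for t in np.linspace(0.0, 2.0, 500):         # data "flows" one sample at a time
    x = np.sin(t)                            # observed state
    y = 0.5 * x - 2.0 * x**3                 # signal to be modeled/compressed
    phi = features(x)
    G += np.outer(phi, phi)                  # only summaries are retained,
    b += phi * y                             # never the raw stream

coeffs = np.linalg.solve(G + 1e-8 * np.eye(d), b)   # offline model build
print("recovered coefficients:", np.round(coeffs, 3))  # ~ [0, 0.5, 0, -2]
```

Storing only `G` and `b` is what makes the compression streaming-friendly: the offline stage can later rebuild a governing model and reconstruct the flow from it plus a few retained samples.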
Given that many intelligent systems\nsuch as self-driving cars are operated in safety-critical contexts where\nperception systems play an important role, there is an urgent need for a\nmore in-depth understanding of the performance and reliability of these MSF\nsystems. To bridge this gap, we take an early step in this direction and\nconstruct a public benchmark of AI-enabled MSF-based perception systems\nincluding three commonly adopted tasks (i.e., object detection, object\ntracking, and depth completion). Based on this, to comprehensively understand\nMSF systems' robustness and reliability, we design 14 common and realistic\ncorruption patterns to synthesize large-scale corrupted datasets. We further\nperform a systematic, large-scale evaluation of these systems. Our results reveal the vulnerability of the current AI-enabled MSF\nperception systems, calling for researchers and practitioners to take\nrobustness and reliability into account when designing AI-enabled MSF.\n","authors":["Xinyu Gao","Zhijie Wang","Yang Feng","Lei Ma","Zhenyu Chen","Baowen Xu"],"pdf_url":"https://arxiv.org/pdf/2306.03454v2.pdf","comment":"To appear in ESEC/FSE 2023"},{"id":"http://arxiv.org/abs/2306.09539v2","updated":"2023-08-29T01:08:30Z","published":"2023-06-15T22:48:08Z","title":"Block-State Transformer","summary":" State space models (SSMs) have shown impressive results on tasks that require\nmodeling long-range dependencies and efficiently scale to long sequences owing\nto their subquadratic runtime complexity. Originally designed for continuous\nsignals, SSMs have shown superior performance on a plethora of tasks in vision\nand audio; however, SSMs still lag Transformer performance in Language Modeling\ntasks. In this work, we propose a hybrid layer named Block-State Transformer\n(BST) that internally combines an SSM sublayer for long-range\ncontextualization, and a Block Transformer sublayer for short-term\nrepresentation of sequences. We study three different, and completely\nparallelizable, variants that integrate SSMs and block-wise attention. We show\nthat our model outperforms similar Transformer-based architectures on language\nmodeling perplexity and generalizes to longer sequences. In addition, the\nBlock-State Transformer demonstrates a more than tenfold increase in speed at the\nlayer level compared to the Block-Recurrent Transformer when model\nparallelization is employed.\n","authors":["Mahan Fathi","Jonathan Pilault","Pierre-Luc Bacon","Christopher Pal","Orhan Firat","Ross Goroshin"],"pdf_url":"https://arxiv.org/pdf/2306.09539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02003v2","updated":"2023-08-29T00:59:44Z","published":"2023-06-03T05:01:51Z","title":"On Optimal Caching and Model Multiplexing for Large Model Inference","summary":" Large Language Models (LLMs) and other large foundation models have achieved\nnoteworthy success, but their size exacerbates existing resource consumption\nand latency challenges. In particular, the large-scale deployment of these\nmodels is hindered by the significant resource requirements during inference.\nIn this paper, we study two approaches for mitigating these challenges:\nemploying a cache to store previous queries and learning a model multiplexer to\nchoose from an ensemble of models for query processing.\n Theoretically, we provide an optimal algorithm for jointly optimizing both\napproaches to reduce the inference cost in both offline and online tabular\nsettings. 
By combining a caching algorithm, namely Greedy Dual Size with\nFrequency (GDSF) or Least Expected Cost (LEC), with a model multiplexer, we\nachieve optimal rates in both offline and online settings. Empirically,\nsimulations show that the combination of our caching and model multiplexing\nalgorithms greatly improves over the baselines, with up to $50\\times$\nimprovement over the baseline when the ratio between the maximum cost and\nminimum cost is $100$. Experiments on real datasets show a $4.3\\times$\nimprovement in FLOPs over the baseline when the ratio for FLOPs is $10$, and a\n$1.8\\times$ improvement in latency when the ratio for average latency is\n$1.85$.\n","authors":["Banghua Zhu","Ying Sheng","Lianmin Zheng","Clark Barrett","Michael I. Jordan","Jiantao Jiao"],"pdf_url":"https://arxiv.org/pdf/2306.02003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09297v3","updated":"2023-08-29T00:49:40Z","published":"2023-06-15T17:25:15Z","title":"Fix Fairness, Don't Ruin Accuracy: Performance Aware Fairness Repair\n using AutoML","summary":" Machine learning (ML) is increasingly being used in critical decision-making\nsoftware, but incidents have raised questions about the fairness of ML\npredictions. To address this issue, new tools and methods are needed to\nmitigate bias in ML-based software. Previous studies have proposed bias\nmitigation algorithms that only work in specific situations and often result in\na loss of accuracy. Our proposed solution is a novel approach that utilizes\nautomated machine learning (AutoML) techniques to mitigate bias. Our approach\nincludes two key innovations: a novel optimization function and a\nfairness-aware search space. By improving the default optimization function of\nAutoML and incorporating fairness objectives, we are able to mitigate bias with\nlittle to no loss of accuracy. Additionally, we propose a fairness-aware search\nspace pruning method for AutoML to reduce computational cost and repair time.\nOur approach, built on the state-of-the-art Auto-Sklearn tool, is designed to\nreduce bias in real-world scenarios. In order to demonstrate the effectiveness\nof our approach, we evaluated our approach on four fairness problems and 16\ndifferent ML models, and our results show a significant improvement over the\nbaseline and existing bias mitigation techniques. Our approach, Fair-AutoML,\nsuccessfully repaired 60 out of 64 buggy cases, while existing bias mitigation\ntechniques only repaired up to 44 out of 64 cases.\n","authors":["Giang Nguyen","Sumon Biswas","Hridesh Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.09297v3.pdf","comment":"In Proceedings of The 31st ACM Joint European Software Engineering\n Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE\n 2023)"},{"id":"http://arxiv.org/abs/2102.02409v3","updated":"2023-08-29T00:49:34Z","published":"2021-02-04T04:36:58Z","title":"Variational Inference for Deblending Crowded Starfields","summary":" In images collected by astronomical surveys, stars and galaxies often overlap\nvisually. Deblending is the task of distinguishing and characterizing\nindividual light sources in survey images. We propose StarNet, a Bayesian\nmethod to deblend sources in astronomical images of crowded star fields.\nStarNet leverages recent advances in variational inference, including amortized\nvariational distributions and an optimization objective targeting an\nexpectation of the forward KL divergence. 
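The caching abstract above names Greedy Dual Size with Frequency (GDSF) as one of its two caching policies. A minimal sketch of a GDSF-style cache under the classic priority rule H = clock + frequency * cost / size; the query costs and sizes attached to entries here are illustrative, and re-inserting an existing key is not handled:

```python
# GDSF-style cache sketch (illustrative, not the paper's implementation).
class GDSFCache:
    def __init__(self, capacity):
        self.capacity, self.clock, self.used = capacity, 0.0, 0
        self.entries = {}    # key -> (priority, freq, cost, size, value)

    def get(self, key):
        if key not in self.entries:
            return None
        _, freq, cost, size, value = self.entries[key]
        pr = self.clock + (freq + 1) * cost / size   # refresh priority on hit
        self.entries[key] = (pr, freq + 1, cost, size, value)
        return value

    def put(self, key, value, cost, size):
        while self.used + size > self.capacity and self.entries:
            victim = min(self.entries, key=lambda k: self.entries[k][0])
            self.clock = self.entries[victim][0]     # aging: raise the clock
            self.used -= self.entries[victim][3]
            del self.entries[victim]
        if size <= self.capacity:
            self.entries[key] = (self.clock + cost / size, 1, cost, size, value)
            self.used += size

cache = GDSFCache(capacity=10)
cache.put("q1", "answer-1", cost=5.0, size=4)
cache.put("q2", "answer-2", cost=1.0, size=8)   # q1 is evicted to make room
print(cache.get("q2"))
```

The clock term ages old entries so that frequently hit, costly, small responses survive longest, which is why GDSF suits caching model responses whose recomputation costs differ widely.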
In our experiments with SDSS images\nof the M2 globular cluster, StarNet is substantially more accurate than two\ncompeting methods: Probabilistic Cataloging (PCAT), a method that uses MCMC for\ninference, and DAOPHOT, a software pipeline employed by SDSS for deblending. In\naddition, the amortized approach to inference gives StarNet the scaling\ncharacteristics necessary to perform Bayesian inference on modern astronomical\nsurveys.\n","authors":["Runjing Liu","Jon D. McAuliffe","Jeffrey Regier"],"pdf_url":"https://arxiv.org/pdf/2102.02409v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14951v1","updated":"2023-08-29T00:44:27Z","published":"2023-08-29T00:44:27Z","title":"Robust Open-Set Spoken Language Identification and the CU MultiLang\n Dataset","summary":" Most state-of-the-art spoken language identification models are closed-set;\nin other words, they can only output a language label from the set of classes\nthey were trained on. Open-set spoken language identification systems, however,\ngain the ability to detect when an input exhibits none of the original\nlanguages. In this paper, we implement a novel approach to open-set spoken\nlanguage identification that uses MFCC and pitch features, a TDNN model to\nextract meaningful feature embeddings, confidence thresholding on softmax\noutputs, and LDA and pLDA for learning to classify new unknown languages. We\npresent a spoken language identification system that achieves 91.76% accuracy\non trained languages and has the capability to adapt to unknown languages on\nthe fly. To that end, we also built the CU MultiLang Dataset, a large and\ndiverse multilingual speech corpus which was used to train and evaluate our\nsystem.\n","authors":["Mustafa Eyceoz","Justin Lee","Siddharth Pittie","Homayoon Beigi"],"pdf_url":"https://arxiv.org/pdf/2308.14951v1.pdf","comment":"6 pages, 1 table, 6 figures"},{"id":"http://arxiv.org/abs/2308.14949v1","updated":"2023-08-29T00:25:02Z","published":"2023-08-29T00:25:02Z","title":"Low-bit Quantization for Deep Graph Neural Networks with\n Smoothness-aware Message Propagation","summary":" Graph Neural Network (GNN) training and inference pose significant\nscalability challenges with respect to both model size and number of\nlayers, resulting in degradation of efficiency and accuracy for large and deep\nGNNs. We present an end-to-end solution that aims to address these challenges\nfor efficient GNNs in resource-constrained environments while avoiding the\noversmoothing problem in deep GNNs. We introduce a quantization-based approach\nfor all stages of GNNs, from message passing in training to node\nclassification, compressing the model and enabling efficient processing. The\nproposed GNN quantizer learns quantization ranges and reduces the model size\nwith comparable accuracy even under low-bit quantization. To scale with the\nnumber of layers, we devise a message propagation mechanism in training that\ncontrols layer-wise changes of similarities between neighboring nodes. This\nobjective is incorporated into a Lagrangian function with constraints, and a\ndifferential multiplier method is utilized to iteratively find optimal\nembeddings. This mitigates oversmoothing and suppresses the quantization error\nto a bound. Significant improvements are demonstrated over state-of-the-art\nquantization methods and deep GNN approaches in both full-precision and\nquantized models. 
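The CU MultiLang abstract above mentions confidence thresholding on softmax outputs as the open-set decision step. A small sketch of that rule; the language set, logits, and threshold value are illustrative assumptions:

```python
# Open-set decision via softmax confidence thresholding (illustrative).
import numpy as np

LANGS = ["en", "es", "tr"]

def softmax(z):
    e = np.exp(z - z.max())   # subtract max for numerical stability
    return e / e.sum()

def classify_open_set(logits, threshold=0.7):
    probs = softmax(logits)
    if probs.max() < threshold:
        return "unknown"      # candidate for learning a new language class
    return LANGS[int(probs.argmax())]

print(classify_open_set(np.array([4.0, 0.5, 0.1])))   # confident -> "en"
print(classify_open_set(np.array([1.0, 0.9, 0.8])))   # diffuse   -> "unknown"
```

Inputs flagged "unknown" are exactly the ones the abstract's LDA/pLDA stage would then use to learn the new language on the fly.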
The proposed quantizer demonstrates superior performance in\nINT2 configurations across all stages of GNN, achieving a notable level of\naccuracy. In contrast, existing quantization approaches fail to generate\nsatisfactory accuracy levels. Finally, the inference with INT2 and INT4\nrepresentations exhibits a speedup of 5.11 $\\times$ and 4.70 $\\times$ compared\nto full-precision counterparts, respectively.\n","authors":["Shuang Wang","Bahaeddin Eravci","Rustam Guliyev","Hakan Ferhatosmanoglu"],"pdf_url":"https://arxiv.org/pdf/2308.14949v1.pdf","comment":"To appear in CIKM2023"},{"id":"http://arxiv.org/abs/2112.01694v4","updated":"2023-08-29T00:20:32Z","published":"2021-12-03T03:31:08Z","title":"On the Existence of the Adversarial Bayes Classifier (Extended Version)","summary":" Adversarial robustness is a critical property in a variety of modern machine\nlearning applications. While it has been the subject of several recent\ntheoretical studies, many important questions related to adversarial robustness\nare still open. In this work, we study a fundamental question regarding Bayes\noptimality for adversarial robustness. We provide general sufficient conditions\nunder which the existence of a Bayes optimal classifier can be guaranteed for\nadversarial robustness. Our results can provide a useful tool for a subsequent\nstudy of surrogate losses in adversarial robustness and their consistency\nproperties. This manuscript is the extended and corrected version of the paper\n\\emph{On the Existence of the Adversarial Bayes Classifier} published in\nNeurIPS 2021. There were two errors in theorem statements in the original paper\n-- one in the definition of pseudo-certifiable robustness and the other in the\nmeasurability of $A^\epsilon$ for arbitrary metric spaces. In this version we correct\nthe errors. Furthermore, the results of the original paper did not apply to\nsome non-strictly convex norms, and here we extend our results to all possible\nnorms.\n","authors":["Pranjal Awasthi","Natalie S. Frank","Mehryar Mohri"],"pdf_url":"https://arxiv.org/pdf/2112.01694v4.pdf","comment":"27 pages, 3 figures. Version 2: Corrects 2 errors in the paper \"On\n the Existence of the Adversarial Bayes Classifier\" published in NeurIPS.\n Version 3: Update to acknowledgements"},{"id":"http://arxiv.org/abs/2103.10000v5","updated":"2023-08-29T00:09:24Z","published":"2021-03-18T03:24:38Z","title":"Human-Inspired Multi-Agent Navigation using Knowledge Distillation","summary":" Despite significant advancements in the field of multi-agent navigation,\nagents still lack the sophistication and intelligence that humans exhibit in\nmulti-agent settings. In this paper, we propose a framework for learning a\nhuman-like general collision avoidance policy for agent-agent interactions in\nfully decentralized, multi-agent environments. Our approach uses knowledge\ndistillation with reinforcement learning to shape the reward function based on\nexpert policies extracted from human trajectory demonstrations through behavior\ncloning. 
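The navigation abstract above shapes the RL reward using expert policies obtained via behavior cloning. A hedged sketch of such a shaped reward; the expert policy, similarity term, and weight are toy stand-ins, not the paper's exact formulation:

```python
# Reward shaping with a behavior-cloned expert (toy sketch).
import numpy as np

def expert_policy(state):
    # placeholder for a policy behavior-cloned from human trajectories
    return -0.1 * state                     # e.g. steer gently toward the goal

def shaped_reward(state, action, task_reward, beta=0.5):
    # penalize squared deviation from the expert's action at this state
    imitation = -np.linalg.norm(action - expert_policy(state)) ** 2
    return task_reward + beta * imitation   # task signal + distillation term

s = np.array([2.0, -1.0])
print(shaped_reward(s, np.array([-0.2, 0.1]), task_reward=1.0))
```

Because the imitation term enters only through the reward, the trained agent can still deviate from the expert when the task reward justifies it, which is how such agents can end up outperforming the demonstrations.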
We show that agents trained with our approach can produce human-like\ntrajectories in collision avoidance and goal-directed steering tasks not\nprovided by the demonstrations, outperforming the experts as well as\nlearning-based agents trained without knowledge distillation.\n","authors":["Pei Xu","Ioannis Karamouzas"],"pdf_url":"https://arxiv.org/pdf/2103.10000v5.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2021"},{"id":"http://arxiv.org/abs/2202.03402v3","updated":"2023-08-29T00:03:00Z","published":"2022-02-07T18:40:38Z","title":"Preserving Privacy and Security in Federated Learning","summary":" Federated learning is known to be vulnerable to both security and privacy\nissues. Existing research has focused either on preventing poisoning attacks\nfrom users or on concealing the local model updates from the server, but not\nboth. However, integrating these two lines of research remains a crucial\nchallenge since they often conflict with one another with respect to the threat\nmodel. In this work, we develop a principled framework that offers both privacy\nguarantees for users and detection of poisoning attacks from them. With a\nnew threat model that includes both an honest-but-curious server and malicious\nusers, we first propose a secure aggregation protocol using homomorphic\nencryption for the server to combine local model updates in a private manner.\nThen, a zero-knowledge proof protocol is leveraged to shift the task of\ndetecting attacks in the local models from the server to the users. The key\nobservation here is that the server no longer needs access to the local models\nfor attack detection. Therefore, our framework enables the central server to\nidentify poisoned model updates without violating the privacy guarantees of\nsecure aggregation.\n","authors":["Truc Nguyen","My T. Thai"],"pdf_url":"https://arxiv.org/pdf/2202.03402v3.pdf","comment":"Published in IEEE/ACM Transactions on Networking"},{"id":"http://arxiv.org/abs/2308.14947v1","updated":"2023-08-29T00:00:18Z","published":"2023-08-29T00:00:18Z","title":"Improving Reinforcement Learning Training Regimes for Social Robot\n Navigation","summary":" In order for autonomous mobile robots to navigate in human spaces, they must\nabide by our social norms. Reinforcement learning (RL) has emerged as an\neffective method to train robot navigation policies that are able to respect\nthese norms. However, a large portion of existing work in the field conducts\nboth RL training and testing in simplistic environments. This limits the\ngeneralization potential of these models to unseen environments, and the\nmeaningfulness of their reported results. We propose a method to improve the\ngeneralization performance of RL social navigation methods using curriculum\nlearning. By employing multiple environment types and by modeling pedestrians\nusing multiple dynamics models, we are able to progressively diversify and\nescalate difficulty in training. Our results show that curriculum\nlearning in training can be used to achieve better generalization performance\nthan previous training methods. We also show that many\nexisting state-of-the-art RL social navigation works do not evaluate their\nmethods outside of their training environments, and thus their reported results do not reflect their\npolicies' failure to adequately generalize to out-of-distribution scenarios. 
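The social-navigation abstract above escalates training difficulty by mixing environment types and pedestrian dynamics models. A minimal sketch of such a curriculum schedule; the stage contents, environment names, and episode counts are assumptions, not the paper's configuration:

```python
# Curriculum schedule sketch for RL social navigation training.
CURRICULUM = [
    {"envs": ["open_hall"],                      "pedestrians": 2,  "dynamics": ["linear"]},
    {"envs": ["open_hall", "corridor"],          "pedestrians": 6,  "dynamics": ["linear", "social_force"]},
    {"envs": ["open_hall", "corridor", "lobby"], "pedestrians": 12, "dynamics": ["linear", "social_force", "orca"]},
]

def stage_for(episode, episodes_per_stage=1000):
    """Map a training episode index to its curriculum stage."""
    idx = min(episode // episodes_per_stage, len(CURRICULUM) - 1)
    return CURRICULUM[idx]

for ep in (0, 1500, 9999):
    print(ep, stage_for(ep))   # early episodes are easy, later ones crowded
```

Evaluation then happens on environments larger and denser than any stage in the schedule, which is the out-of-distribution test the abstract argues for.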
In\nresponse, we validate our training approach on larger and more crowded testing\nenvironments than those used in training, allowing for more meaningful\nmeasurements of model performance.\n","authors":["Adam Sigal","Hsiu-Chin Lin","AJung Moon"],"pdf_url":"https://arxiv.org/pdf/2308.14947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.01255v4","updated":"2023-08-29T23:41:36Z","published":"2022-06-02T19:11:27Z","title":"Compressive Fourier collocation methods for high-dimensional diffusion\n equations with periodic boundary conditions","summary":" High-dimensional Partial Differential Equations (PDEs) are a popular\nmathematical modelling tool, with applications ranging from finance to\ncomputational chemistry. However, standard numerical techniques for solving\nthese PDEs are typically affected by the curse of dimensionality. In this work,\nwe tackle this challenge while focusing on stationary diffusion equations\ndefined over a high-dimensional domain with periodic boundary conditions.\nInspired by recent progress in sparse function approximation in high\ndimensions, we propose a new method called compressive Fourier collocation.\nCombining ideas from compressive sensing and spectral collocation, our method\nreplaces the use of structured collocation grids with Monte Carlo sampling and\nemploys sparse recovery techniques, such as orthogonal matching pursuit and\n$\\ell^1$ minimization, to approximate the Fourier coefficients of the PDE\nsolution. We conduct a rigorous theoretical analysis showing that the\napproximation error of the proposed method is comparable with the best $s$-term\napproximation (with respect to the Fourier basis) to the solution. Using the\nrecently introduced framework of random sampling in bounded Riesz systems, our\nanalysis shows that the compressive Fourier collocation method mitigates the\ncurse of dimensionality with respect to the number of collocation points under\nsufficient conditions on the regularity of the diffusion coefficient. We also\npresent numerical experiments that illustrate the accuracy and stability of the\nmethod for the approximation of sparse and compressible solutions.\n","authors":["Weiqi Wang","Simone Brugiapaglia"],"pdf_url":"https://arxiv.org/pdf/2206.01255v4.pdf","comment":"33 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.15667v1","updated":"2023-08-29T23:35:36Z","published":"2023-08-29T23:35:36Z","title":"Bridging Distribution Learning and Image Clustering in High-dimensional\n Space","summary":" Distribution learning focuses on learning the probability density function\nfrom a set of data samples. In contrast, clustering aims to group similar\nobjects together in an unsupervised manner. Usually, these two tasks are\nconsidered unrelated. However, the two may be indirectly related, with\nGaussian Mixture Models (GMMs) acting as a bridge.\nIn this paper, we focus on exploring the correlation between distribution\nlearning and clustering, with the motivation to fill the gap between these two\nfields, utilizing an autoencoder (AE) to encode images into a high-dimensional\nlatent space. Then, Monte-Carlo Marginalization (MCMarg) and Kullback-Leibler\n(KL) divergence loss are used to fit the Gaussian components of the GMM and\nlearn the data distribution. Finally, image clustering is achieved through each\nGaussian component of the GMM. Yet, the \"curse of dimensionality\" poses severe\nchallenges for most clustering algorithms. 
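The abstract above encodes images with an AE, fits a GMM in the latent space, and clusters images via the Gaussian components. A compact sketch of that pipeline, with sklearn's EM-based GaussianMixture standing in for the paper's MCMarg/KL fitting and PCA standing in for the autoencoder:

```python
# Encode -> fit GMM in latent space -> cluster by component (illustrative).
import numpy as np
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
data = np.vstack([rng.normal(-3, 1, (100, 32)),   # two synthetic "image" modes
                  rng.normal(3, 1, (100, 32))])

latent = PCA(n_components=8).fit_transform(data)  # "encoder" stand-in
gmm = GaussianMixture(n_components=2, random_state=0).fit(latent)
clusters = gmm.predict(latent)    # cluster = most responsible Gaussian component
print(np.bincount(clusters))      # roughly [100, 100]
```

The abstract's point is precisely that the EM stand-in used here struggles in high dimensions, which motivates replacing it with MCMarg plus a KL divergence loss.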
Experimental results show that, compared with the classic\nExpectation-Maximization (EM) algorithm, MCMarg\nand KL divergence can greatly alleviate the difficulty. Based on the\nexperimental results, we believe distribution learning can exploit the\npotential of GMM in image clustering within high-dimensional space.\n","authors":["Guanfang Dong","Chenqiu Zhao","Anup Basu"],"pdf_url":"https://arxiv.org/pdf/2308.15667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12315v2","updated":"2023-08-29T23:19:53Z","published":"2023-08-23T08:38:54Z","title":"Trustworthy Representation Learning Across Domains","summary":" As AI systems have achieved performance sufficient for wide deployment in\nour daily lives and human society, people both enjoy the benefits brought by\nthese technologies and suffer from many social issues induced by these systems. To\nmake AI systems good enough and trustworthy, a great deal of research has been\ndone to build guidelines for trustworthy AI systems. Machine learning is one of\nthe most important parts of AI systems, and representation learning is a\nfundamental technology in machine learning. Making representation\nlearning trustworthy in real-world applications, e.g., cross-domain scenarios,\nis very valuable and necessary for both machine learning and AI system fields.\nInspired by the concepts in trustworthy AI, we propose the first trustworthy\nrepresentation learning across domains framework, which includes four concepts,\ni.e., robustness, privacy, fairness, and explainability, to give a comprehensive\nliterature review on this research direction. Specifically, we first introduce\nthe details of the proposed trustworthy framework for representation learning\nacross domains. Second, we provide basic notions and comprehensively summarize\nexisting methods for the trustworthy framework from four concepts. Finally, we\nconclude this survey with insights and discussions on future research\ndirections.\n","authors":["Ronghang Zhu","Dongliang Guo","Daiqing Qi","Zhixuan Chu","Xiang Yu","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.12315v2.pdf","comment":"38 pages, 15 figures"},{"id":"http://arxiv.org/abs/2303.08112v3","updated":"2023-08-29T22:55:27Z","published":"2023-03-14T17:47:09Z","title":"Eliciting Latent Predictions from Transformers with the Tuned Lens","summary":" We analyze transformers from the perspective of iterative inference, seeking\nto understand how model predictions are refined layer by layer. To do so, we\ntrain an affine probe for each block in a frozen pretrained model, making it\npossible to decode every hidden state into a distribution over the vocabulary.\nOur method, the tuned lens, is a refinement of the earlier \"logit lens\"\ntechnique, which yielded useful insights but is often brittle.\n We test our method on various autoregressive language models with up to 20B\nparameters, showing it to be more predictive, reliable and unbiased than the\nlogit lens. With causal experiments, we show the tuned lens uses similar\nfeatures to the model itself. We also find the trajectory of latent predictions\ncan be used to detect malicious inputs with high accuracy. 
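The tuned-lens abstract above trains an affine probe per block to decode hidden states into vocabulary distributions. A hedged sketch of decoding one hidden state this way; the dimensions, the frozen unembedding, and the hidden state are random stand-ins for a real pretrained model:

```python
# Affine-probe decoding of a hidden state, tuned-lens style (sketch).
import torch

d_model, vocab = 64, 1000
unembed = torch.nn.Linear(d_model, vocab, bias=False)   # frozen final head
for p in unembed.parameters():
    p.requires_grad_(False)

probe = torch.nn.Linear(d_model, d_model)   # trainable affine map, one per block
hidden = torch.randn(1, d_model)            # hidden state at some layer

logits = unembed(probe(hidden))             # this layer's "latent prediction"
print(logits.softmax(-1).topk(3).indices)   # top-3 tokens predicted so far
```

Training the probe to match the model's final-layer distribution (rather than reusing the raw unembedding, as the logit lens does) is what makes the decoded trajectory less brittle.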
All code needed to\nreproduce our results can be found at\nhttps://github.com/AlignmentResearch/tuned-lens.\n","authors":["Nora Belrose","Zach Furman","Logan Smith","Danny Halawi","Igor Ostrovsky","Lev McKinney","Stella Biderman","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2303.08112v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09929v3","updated":"2023-08-29T22:52:28Z","published":"2022-11-17T23:01:47Z","title":"Contrastive Credibility Propagation for Reliable Semi-Supervised\n Learning","summary":" Producing labels for unlabeled data is error-prone, making semi-supervised\nlearning (SSL) troublesome. Often, little is known about when and why an\nalgorithm fails to outperform a supervised baseline. Using benchmark datasets,\nwe craft five common real-world SSL data scenarios: few-label, open-set,\nnoisy-label, and class distribution imbalance/misalignment in the labeled and\nunlabeled sets. We propose a novel algorithm called Contrastive Credibility\nPropagation (CCP) for deep SSL via iterative transductive pseudo-label\nrefinement. CCP unifies semi-supervised learning and noisy label learning for\nthe goal of reliably outperforming a supervised baseline in any data scenario.\nCompared to prior methods which focus on a subset of scenarios, CCP uniquely\noutperforms the supervised baseline in all scenarios, supporting practitioners\nwhen the qualities of labeled or unlabeled data are unknown.\n","authors":["Brody Kutt","Pralay Ramteke","Xavier Mignot","Pamela Toman","Nandini Ramanan","Sujit Rokka Chhetri","Shan Huang","Min Du","William Hewlett"],"pdf_url":"https://arxiv.org/pdf/2211.09929v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06074v2","updated":"2023-08-29T22:30:15Z","published":"2022-12-12T17:41:32Z","title":"Regression with Label Differential Privacy","summary":" We study the task of training regression models with the guarantee of label\ndifferential privacy (DP). Based on a global prior distribution on label\nvalues, which could be obtained privately, we derive a label DP randomization\nmechanism that is optimal under a given regression loss function. We prove that\nthe optimal mechanism takes the form of a \"randomized response on bins\", and\npropose an efficient algorithm for finding the optimal bin values. We carry out\na thorough experimental evaluation on several datasets demonstrating the\nefficacy of our algorithm.\n","authors":["Badih Ghazi","Pritish Kamath","Ravi Kumar","Ethan Leeman","Pasin Manurangsi","Avinash Varadarajan","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.06074v2.pdf","comment":"Appeared at ICLR '23, 28 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.15656v1","updated":"2023-08-29T22:23:52Z","published":"2023-08-29T22:23:52Z","title":"Deep Reinforcement Learning Based Framework for Mobile Energy\n Disseminator Dispatching to Charge On-the-Road Electric Vehicles","summary":" The exponential growth of electric vehicles (EVs) presents novel challenges\nin preserving battery health and in addressing the persistent problem of\nvehicle range anxiety. To address these concerns, wireless charging,\nparticularly via Mobile Energy Disseminators (MEDs), has emerged as a promising\nsolution. The MED is mounted behind a large vehicle and charges all\nparticipating EVs within a radius upstream of it. Unfortunately, during such\nV2V charging, the MED and EVs inadvertently form platoons, thereby occupying\nmultiple lanes and impairing overall corridor travel efficiency. 
In addition,\nconstrained budgets for MED deployment necessitate the development of an\neffective dispatching strategy to determine optimal timing and locations for\nintroducing the MEDs into traffic. This paper proposes a deep reinforcement\nlearning (DRL) based methodology to develop a vehicle dispatching framework. In\nthe first component of the framework, we develop a realistic reinforcement\nlearning environment termed \"ChargingEnv\", which incorporates a reliable\ncharging simulation system that accounts for common practical issues in\nwireless charging deployment, specifically, the charging panel misalignment.\nThe second component, the Proximal-Policy Optimization (PPO) agent, is trained\nto control MED dispatching through continuous interactions with ChargingEnv.\nNumerical experiments were carried out to demonstrate the\nefficacy of the proposed MED deployment decision processor. The experimental\nresults suggest that the proposed model can significantly enhance EV travel\nrange while efficiently deploying an optimal number of MEDs. The proposed model\nis found to be not only practical in its applicability but also promising in its\nreal-world effectiveness. The proposed model can help travelers to maximize EV\nrange and help road agencies or private-sector vendors to manage the deployment\nof MEDs efficiently.\n","authors":["Jiaming Wang","Jiqian Dong","Sikai Chen","Shreyas Sundaram","Samuel Labi"],"pdf_url":"https://arxiv.org/pdf/2308.15656v1.pdf","comment":"Submitted for presentation only at the 2024 Annual Meeting of the\n Transportation Research Board"},{"id":"http://arxiv.org/abs/1904.08576v5","updated":"2023-08-29T22:17:05Z","published":"2019-04-18T02:56:00Z","title":"On Low-rank Trace Regression under General Sampling Distribution","summary":" In this paper, we study the trace regression when a matrix of parameters B*\nis estimated via the convex relaxation of a rank-regularized regression or via\nregularized non-convex optimization. It is known that these estimators satisfy\nnear-optimal error bounds under assumptions on the rank, coherence, and\nspikiness of B*. We start by introducing a general notion of spikiness for B*\nthat provides a generic recipe to prove the restricted strong convexity of the\nsampling operator of the trace regression and obtain near-optimal and\nnon-asymptotic error bounds for the estimation error. Similar to the existing\nliterature, these results require the regularization parameter to be above a\ncertain theory-inspired threshold that depends on the observation noise, which may be\nunknown in practice. Next, we extend the error bounds to cases where the\nregularization parameter is chosen via cross-validation. This result is\nsignificant in that existing theoretical results on cross-validated estimators\n(Kale et al., 2011; Kumar et al., 2013; Abou-Moustafa and Szepesvari, 2017) do\nnot apply to our setting since the estimators we study are not known to satisfy\ntheir required notion of stability. 
Finally, using simulations on synthetic and\nreal data, we show that the cross-validated estimator selects a near-optimal\npenalty parameter and outperforms the theory-inspired approach of selecting the\nparameter.\n","authors":["Nima Hamidi","Mohsen Bayati"],"pdf_url":"https://arxiv.org/pdf/1904.08576v5.pdf","comment":"49 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.02422v3","updated":"2023-08-29T22:12:04Z","published":"2023-05-03T20:29:04Z","title":"GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content","summary":" The mobile cloud gaming industry has been rapidly growing over the last\ndecade. When streaming gaming videos are transmitted to customers' client\ndevices from cloud servers, algorithms that can monitor distorted video quality\nwithout having any reference video available are desirable tools. However,\ncreating No-Reference Video Quality Assessment (NR VQA) models that can\naccurately predict the quality of streaming gaming videos rendered by computer\ngraphics engines is a challenging problem, since gaming content generally\ndiffers statistically from naturalistic videos, often lacks detail, and\ncontains many smooth regions. Until recently, the problem has been further\ncomplicated by the lack of adequate subjective quality databases of mobile\ngaming content. We have created a new gaming-specific NR VQA model called the\nGaming Video Quality Evaluator (GAMIVAL), which combines and leverages the\nadvantages of spatial and temporal gaming distorted scene statistics models, a\nneural noise model, and deep semantic features. Using support vector\nregression (SVR) as a regressor, GAMIVAL achieves superior performance on the\nnew LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database.\n","authors":["Yu-Chih Chen","Avinab Saha","Chase Davis","Bo Qiu","Xiaoming Wang","Rahul Gowda","Ioannis Katsavounidis","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2305.02422v3.pdf","comment":"Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been\n made available online: https://github.com/lskdream/GAMIVAL"},{"id":"http://arxiv.org/abs/2308.15651v1","updated":"2023-08-29T22:03:17Z","published":"2023-08-29T22:03:17Z","title":"Ensuring User-side Fairness in Dynamic Recommender Systems","summary":" User-side group fairness is crucial for modern recommender systems, as it\naims to alleviate performance disparity between groups of users defined by\nsensitive attributes such as gender, race, or age. We find that the disparity\ntends to persist or even increase over time. This calls for effective ways to\naddress user-side fairness in a dynamic environment, which has been\ninfrequently explored in the literature. However, fairness-constrained\nre-ranking, a typical method to ensure user-side fairness (i.e., reducing\nperformance disparity), faces two fundamental challenges in the dynamic\nsetting: (1) non-differentiability of the ranking-based fairness constraint,\nwhich hinders the end-to-end training paradigm, and (2) time-inefficiency,\nwhich impedes quick adaptation to changes in user preferences. In this paper,\nwe propose FAir Dynamic rEcommender (FADE), an end-to-end framework with a\nfine-tuning strategy to dynamically alleviate performance disparity. To tackle\nthe above challenges, FADE uses a novel fairness loss designed to be\ndifferentiable and lightweight to fine-tune model parameters to ensure both\nuser-side fairness and high-quality recommendations. 
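The FADE abstract above calls for a differentiable, lightweight fairness loss suitable for fine-tuning. A minimal sketch of one such penalty, the squared gap between group mean utilities; this is an illustrative surrogate under assumed per-user utility proxies, not FADE's actual loss:

```python
# Differentiable user-side fairness penalty (illustrative surrogate).
import torch

def fairness_penalty(utilities, group_mask):
    """utilities: (n_users,) differentiable per-user performance proxies;
    group_mask: (n_users,) bool, True for the protected group."""
    gap = utilities[group_mask].mean() - utilities[~group_mask].mean()
    return gap ** 2   # smooth, so gradients flow end-to-end

utilities = torch.rand(32, requires_grad=True)   # stand-in for model outputs
mask = torch.arange(32) < 16
rec_loss = (1.0 - utilities).mean()              # stand-in for rec-quality loss
loss = rec_loss + 2.0 * fairness_penalty(utilities, mask)
loss.backward()                                  # fine-tunable by gradient descent
print(float(loss))
```

Replacing a hard ranking-based constraint with a smooth penalty like this is what restores end-to-end trainability in the dynamic setting the abstract describes.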
Via extensive experiments\non the real-world dataset, we empirically demonstrate that FADE effectively and\nefficiently reduces performance disparity, and furthermore, FADE improves\noverall recommendation quality over time compared to not using any new data.\n","authors":["Hyunsik Yoo","Zhichen Zeng","Jian Kang","Zhining Liu","David Zhou","Fei Wang","Eunice Chan","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2308.15651v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.08303v3","updated":"2023-08-29T21:52:58Z","published":"2023-07-17T07:55:47Z","title":"Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language\n Models","summary":" Dense retrieval (DR) converts queries and documents into dense embeddings and\nmeasures the similarity between queries and documents in vector space. One of\nthe challenges in DR is the lack of domain-specific training data. While DR\nmodels can learn from large-scale public datasets like MS MARCO through\ntransfer learning, evidence shows that not all DR models and domains can\nbenefit from transfer learning equally. Recently, some researchers have\nresorted to large language models (LLMs) to improve the zero-shot and few-shot\nDR models. However, the hard prompts or human-written prompts utilized in these\nworks cannot guarantee the good quality of generated weak queries. To tackle\nthis, we propose soft prompt tuning for augmenting DR (SPTAR): For each task,\nwe leverage soft prompt-tuning to optimize a task-specific soft prompt on\nlimited ground truth data and then prompt the LLMs to tag unlabeled documents\nwith weak queries, yielding enough weak document-query pairs to train\ntask-specific dense retrievers. We design a filter to select high-quality\nexample document-query pairs in the prompt to further improve the quality of\nweak tagged queries. To the best of our knowledge, there is no prior work\nutilizing soft prompt tuning to augment DR models. The experiments demonstrate\nthat SPTAR outperforms the unsupervised baselines BM25 and the recently\nproposed LLMs-based augmentation method for DR.\n","authors":["Zhiyuan Peng","Xuyang Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08303v3.pdf","comment":"fix typos"},{"id":"http://arxiv.org/abs/2308.15647v1","updated":"2023-08-29T21:49:28Z","published":"2023-08-29T21:49:28Z","title":"A General Recipe for Automated Machine Learning in Practice","summary":" Automated Machine Learning (AutoML) is an area of research that focuses on\ndeveloping methods to generate machine learning models automatically. The idea\nof being able to build machine learning models with very little human\nintervention represents a great opportunity for the practice of applied machine\nlearning. However, there is very little information on how to design an AutoML\nsystem in practice. Most of the research focuses on the problems facing\noptimization algorithms and leaves out the details of how that would be done in\npractice. In this paper, we propose a frame of reference for building general\nAutoML systems. Through a narrative review of the main approaches in the area,\nour main idea is to distill the fundamental concepts in order to support them\nin a single design. 
Finally, we discuss some open problems related to the\napplication of AutoML for future research.\n","authors":["Hernan Ceferino Vazquez"],"pdf_url":"https://arxiv.org/pdf/2308.15647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15642v1","updated":"2023-08-29T21:27:21Z","published":"2023-08-29T21:27:21Z","title":"Clustering Without an Eigengap","summary":" We study graph clustering in the Stochastic Block Model (SBM) in the presence\nof both large clusters and small, unrecoverable clusters. Previous approaches\nachieving exact recovery do not allow any small clusters of size $o(\\sqrt{n})$,\nor require a size gap between the smallest recovered cluster and the largest\nnon-recovered cluster. We provide an algorithm based on semidefinite\nprogramming (SDP) which removes these requirements and provably recovers large\nclusters regardless of the remaining cluster sizes. Mid-sized clusters pose\nunique challenges to the analysis, since their proximity to the recovery\nthreshold makes them highly sensitive to small noise perturbations and\nprecludes a closed-form candidate solution. We develop novel techniques,\nincluding a leave-one-out-style argument which controls the correlation between\nSDP solutions and noise vectors even when the removal of one row of noise can\ndrastically change the SDP solution. We also develop improved eigenvalue\nperturbation bounds of potential independent interest. Using our gap-free\nclustering procedure, we obtain efficient algorithms for the problem of\nclustering with a faulty oracle with superior query complexities, notably\nachieving $o(n^2)$ sample complexity even in the presence of a large number of\nsmall clusters. Our gap-free clustering procedure also leads to improved\nalgorithms for recursive clustering. Our results extend to certain\nheterogeneous probability settings that are challenging for alternative\nalgorithms.\n","authors":["Matthew Zurek","Yudong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15642v1.pdf","comment":"68 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.15640v1","updated":"2023-08-29T21:25:24Z","published":"2023-08-29T21:25:24Z","title":"Identifying Constitutive Parameters for Complex Hyperelastic Solids\n using Physics-Informed Neural Networks","summary":" Identifying constitutive parameters in engineering and biological materials,\nparticularly those with intricate geometries and mechanical behaviors, remains\na longstanding challenge. The recent advent of Physics-Informed Neural Networks\n(PINNs) offers promising solutions, but current frameworks are often limited to\nbasic constitutive laws and encounter practical constraints when combined with\nexperimental data. In this paper, we introduce a new PINN-based framework\ndesigned to identify material parameters for soft materials, specifically those\nexhibiting complex constitutive behaviors, under large deformation in plane\nstress conditions. Distinctively, our model emphasizes training PINNs with\nmulti-modal time-dependent experimental datasets consisting of full-field\ndeformation and loading history, ensuring algorithm robustness even amidst\nnoisy data. Our results reveal that our framework can accurately identify\nconstitutive parameters of the incompressible Arruda-Boyce model for samples\nwith intricate geometries, maintaining an error below 5%, even with an\nexperimental noise level of 5%. 
We believe our framework sets the stage for a\ntransformative approach in modulus identification for complex solids,\nespecially for those with geometric and constitutive intricacies.\n","authors":["Siyuan Song","Hanxun Jin"],"pdf_url":"https://arxiv.org/pdf/2308.15640v1.pdf","comment":"31 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.09884v2","updated":"2023-08-29T21:23:03Z","published":"2023-08-19T02:30:35Z","title":"A Transformer-based Framework For Multi-variate Time Series: A Remaining\n Useful Life Prediction Use Case","summary":" In recent times, Large Language Models (LLMs) have captured a global\nspotlight and revolutionized the field of Natural Language Processing. One of\nthe factors attributed to the effectiveness of LLMs is the model architecture\nused for training, transformers. Transformer models excel at capturing\ncontextual features in sequential data. Since time series data are sequential,\ntransformer models can be leveraged for more efficient time series data\nprediction. The field of prognostics is vital to system health management and\nproper maintenance planning. A reliable estimation of the remaining useful life\n(RUL) of machines holds the potential for substantial cost savings. This\nincludes avoiding abrupt machine failures, maximizing equipment usage, and\nserving as a decision support system (DSS). This work proposed an\nencoder-transformer architecture-based framework for multivariate time series\nprediction for a prognostics use case. We validated the effectiveness of the\nproposed framework on all four sets of the C-MAPPS benchmark dataset for the\nremaining useful life prediction task. To effectively transfer the knowledge\nand application of transformers from the natural language domain to time\nseries, three model-specific experiments were conducted. Also, to enable the\nmodel's awareness of the initial stages of the machine life and its degradation\npath, a novel expanding window method was proposed for the first time in this\nwork; it was compared with the sliding window method, and it led to a large\nimprovement in the performance of the encoder transformer model. Finally, the\nperformance of the proposed encoder-transformer model was evaluated on the test\ndataset and compared with the results from 13 other state-of-the-art (SOTA)\nmodels in the literature, and it outperformed them all with an average\nperformance increase of 137.65% over the next best model across all the\ndatasets.\n","authors":["Oluwaseyi Ogunfowora","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2308.09884v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15639v1","updated":"2023-08-29T21:20:16Z","published":"2023-08-29T21:20:16Z","title":"Hyperbolic Convolutional Neural Networks","summary":" Deep Learning is mostly responsible for the surge of interest in Artificial\nIntelligence in the last decade. So far, deep learning researchers have been\nparticularly successful in the domain of image processing, where Convolutional\nNeural Networks are used. Although excelling at image classification,\nConvolutional Neural Networks are quite naive in that no inductive bias is set\non the embedding space for images. Similar flaws are also exhibited by another\ntype of Convolutional Networks - Graph Convolutional Neural Networks. However,\nusing non-Euclidean space for embedding data might result in more robust and\nexplainable models. One example of such a non-Euclidean space is hyperbolic\nspace. 
Hyperbolic spaces are particularly useful due to their ability to fit\nmore data in a low-dimensional space and their tree-likeness properties. These\nattractive properties have been previously used in multiple papers which\nindicated that they are beneficial for building hierarchical embeddings using\nshallow models and, recently, using MLPs and RNNs.\n However, no papers have yet suggested a general approach to using Hyperbolic\nConvolutional Neural Networks for structured data processing, although these\nare the most common examples of data used. Therefore, the goal of this work is\nto devise a general recipe for building Hyperbolic Convolutional Neural\nNetworks. We hypothesize that the ability of hyperbolic space to capture\nhierarchy in the data would lead to better performance. This ability should be\nparticularly useful in cases where data has a tree-like structure. Since this\nis the case for many existing datasets \citep{wordnet, imagenet, fb15k}, we\nargue that such a model would be advantageous both in terms of applications and\nfuture research prospects.\n","authors":["Andrii Skliar","Maurice Weiler"],"pdf_url":"https://arxiv.org/pdf/2308.15639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09175v2","updated":"2023-08-29T20:33:12Z","published":"2023-08-17T20:27:33Z","title":"Diversifying AI: Towards Creative Chess with AlphaZero","summary":" In recent years, Artificial Intelligence (AI) systems have surpassed human\nintelligence in a variety of computational tasks. However, AI systems, like\nhumans, make mistakes, have blind spots, hallucinate, and struggle to\ngeneralize to new situations. This work explores whether AI can benefit from\ncreative decision-making mechanisms when pushed to the limits of its\ncomputational rationality. In particular, we investigate whether a team of\ndiverse AI systems can outperform a single AI in challenging tasks by\ngenerating more ideas as a group and then selecting the best ones. We study\nthis question in the game of chess, the so-called drosophila of AI. We build on\nAlphaZero (AZ) and extend it to represent a league of agents via a\nlatent-conditioned architecture, which we call AZ_db. We train AZ_db to\ngenerate a wider range of ideas using behavioral diversity techniques and\nselect the most promising ones with sub-additive planning. Our experiments\nsuggest that AZ_db plays chess in diverse ways, solves more puzzles as a group\nand outperforms a more homogeneous team. Notably, AZ_db solves twice as many\nchallenging puzzles as AZ, including the challenging Penrose positions. When\nplaying chess from different openings, we notice that players in AZ_db\nspecialize in different openings, and that selecting a player for each opening\nusing sub-additive planning results in a 50 Elo improvement over AZ. 
Our\nfindings suggest that diversity bonuses emerge in teams of AI agents, just as\nthey do in teams of humans and that diversity is a valuable asset in solving\ncomputationally hard problems.\n","authors":["Tom Zahavy","Vivek Veeriah","Shaobo Hou","Kevin Waugh","Matthew Lai","Edouard Leurent","Nenad Tomasev","Lisa Schut","Demis Hassabis","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2308.09175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15618v1","updated":"2023-08-29T20:25:49Z","published":"2023-08-29T20:25:49Z","title":"RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware\n Contextual Reasoning on Whole Slide Images","summary":" Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer\nin the US. It is diagnosed by manual multi-class tumor grading using a tissue\nwhole slide image (WSI), which is subjective and suffers from inter-pathologist\nvariability. We propose an automated weakly-supervised grading approach for\ncSCC WSIs that is trained using WSI-level grade and does not require\nfine-grained tumor annotations. The proposed model, RACR-MIL, transforms each\nWSI into a bag of tiled patches and leverages attention-based multiple-instance\nlearning to assign a WSI-level grade. We propose three key innovations to\naddress general as well as cSCC-specific challenges in tumor grading. First, we\nleverage spatial and semantic proximity to define a WSI graph that encodes both\nlocal and non-local dependencies between tumor regions and leverage graph\nattention convolution to derive contextual patch features. Second, we introduce\na novel ordinal ranking constraint on the patch attention network to ensure\nthat higher-grade tumor regions are assigned higher attention. Third, we use\ntumor depth as an auxiliary task to improve grade classification in a multitask\nlearning framework. RACR-MIL achieves 2-9% improvement in grade classification\nover existing weakly-supervised approaches on a dataset of 718 cSCC tissue\nimages and localizes the tumor better. The model achieves 5-20% higher accuracy\nin difficult-to-classify high-risk grade classes and is robust to class\nimbalance.\n","authors":["Anirudh Choudhary","Angelina Hwang","Jacob Kechter","Krishnakant Saboo","Blake Bordeaux","Puneet Bhullar","Nneka Comfere","David DiCaudo","Steven Nelson","Emma Johnson","Leah Swanson","Dennis Murphree","Aaron Mangold","Ravishankar K. Iyer"],"pdf_url":"https://arxiv.org/pdf/2308.15618v1.pdf","comment":"7 pages main text, 2 page references, 3 page appendix; submitted to\n AAAI"},{"id":"http://arxiv.org/abs/2308.15614v1","updated":"2023-08-29T20:14:42Z","published":"2023-08-29T20:14:42Z","title":"Everything Perturbed All at Once: Enabling Differentiable Graph Attacks","summary":" As powerful tools for representation learning on graphs, graph neural\nnetworks (GNNs) have played an important role in applications including social\nnetworks, recommendation systems, and online web services. However, GNNs have\nbeen shown to be vulnerable to adversarial attacks, which can significantly\ndegrade their effectiveness. Recent state-of-the-art approaches in adversarial\nattacks rely on gradient-based meta-learning to selectively perturb a single\nedge with the highest attack score until they reach the budget constraint.\nWhile effective in identifying vulnerable links, these methods are plagued by\nhigh computational costs. 
By leveraging continuous relaxation and\nparameterization of the graph structure, we propose a novel attack method\ncalled Differentiable Graph Attack (DGA) to efficiently generate effective\nattacks while eliminating the need for costly retraining. Compared to the\nstate-of-the-art, DGA achieves nearly equivalent attack performance with 6\ntimes less training time and 11 times smaller GPU memory footprint on different\nbenchmark datasets. Additionally, we provide extensive experimental analyses of\nthe transferability of the DGA among different graph models, as well as its\nrobustness against widely-used defense mechanisms.\n","authors":["Haoran Liu","Bokun Wang","Jianling Wang","Xiangjue Dong","Tianbao Yang","James Caverlee"],"pdf_url":"https://arxiv.org/pdf/2308.15614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15613v1","updated":"2023-08-29T20:13:37Z","published":"2023-08-29T20:13:37Z","title":"Mixed Variational Flows for Discrete Variables","summary":" Variational flows allow practitioners to learn complex continuous\ndistributions, but approximating discrete distributions remains a challenge.\nCurrent methodologies typically embed the discrete target in a continuous space\n- usually via continuous relaxation or dequantization - and then apply a\ncontinuous flow. These approaches involve a surrogate target that may not\ncapture the original discrete target, might have biased or unstable gradients,\nand can create a difficult optimization problem. In this work, we develop a\nvariational flow family for discrete distributions without any continuous\nembedding. First, we develop a measure-preserving and discrete (MAD) invertible\nmap that leaves the discrete target invariant, and then create a mixed\nvariational flow (MAD Mix) based on that map. We also develop an extension to\nMAD Mix that handles joint discrete and continuous models. Our experiments\nsuggest that MAD Mix produces more reliable approximations than\ncontinuous-embedding flows while being significantly faster to train.\n","authors":["Gian Carlo Diluvi","Benjamin Bloem-Reddy","Trevor Campbell"],"pdf_url":"https://arxiv.org/pdf/2308.15613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15609v1","updated":"2023-08-29T20:02:24Z","published":"2023-08-29T20:02:24Z","title":"InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning","summary":" One-Shot Neural Architecture Search (NAS) algorithms often rely on training a\nhardware agnostic super-network for a domain specific task. Optimal\nsub-networks are then extracted from the trained super-network for different\nhardware platforms. However, training super-networks from scratch can be\nextremely time consuming and compute intensive especially for large models that\nrely on a two-stage training process of pre-training and fine-tuning. State of\nthe art pre-trained models are available for a wide range of tasks, but their\nlarge sizes significantly limit their applicability on various hardware\nplatforms. We propose InstaTune, a method that leverages off-the-shelf\npre-trained weights for large models and generates a super-network during the\nfine-tuning stage. InstaTune has multiple benefits. Firstly, since the process\nhappens during fine-tuning, it minimizes the overall time and compute resources\nrequired for NAS. Secondly, the sub-networks extracted are optimized for the\ntarget task, unlike prior work that optimizes on the pre-training objective.\nFinally, InstaTune is easy to \"plug and play\" in existing frameworks. 
By using\nmulti-objective evolutionary search algorithms along with lightly trained\npredictors, we find Pareto-optimal sub-networks that outperform their\nrespective baselines across different performance objectives such as accuracy\nand MACs. Specifically, we demonstrate that our approach performs well across\nboth unimodal (ViT and BERT) and multi-modal (BEiT-3) transformer based\narchitectures.\n","authors":["Sharath Nittur Sridhar","Souvik Kundu","Sairam Sundaresan","Maciej Szankin","Anthony Sarah"],"pdf_url":"https://arxiv.org/pdf/2308.15609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15605v1","updated":"2023-08-29T19:54:37Z","published":"2023-08-29T19:54:37Z","title":"Measurement Tampering Detection Benchmark","summary":" When training powerful AI systems to perform complex tasks, it may be\nchallenging to provide training signals which are robust to optimization. One\nconcern is measurement tampering, where the AI system manipulates multiple\nmeasurements to create the illusion of good results instead of achieving the\ndesired outcome. In this work, we build four new text-based datasets to\nevaluate measurement tampering detection techniques on large language models.\nConcretely, given sets of text inputs and measurements aimed at determining if\nsome outcome occurred, as well as a base model able to accurately predict\nmeasurements, the goal is to determine if examples where all measurements\nindicate the outcome actually had the outcome occur, or if this was caused by\nmeasurement tampering. We demonstrate techniques that outperform simple\nbaselines on most datasets, but don't achieve maximum performance. We believe\nthere is significant room for improvement for both techniques and datasets, and\nwe are excited for future work tackling measurement tampering.\n","authors":["Fabien Roger","Ryan Greenblatt","Max Nadeau","Buck Shlegeris","Nate Thomas"],"pdf_url":"https://arxiv.org/pdf/2308.15605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14935v2","updated":"2023-08-29T19:52:02Z","published":"2022-11-27T21:00:31Z","title":"RecXplainer: Amortized Attribute-based Personalized Explanations for\n Recommender Systems","summary":" Recommender systems influence many of our interactions in the digital world\n-- impacting how we shop for clothes, sorting what we see when browsing YouTube\nor TikTok, and determining which restaurants and hotels we are shown when using\nhospitality platforms. Modern recommender systems are large, opaque models\ntrained on a mixture of proprietary and open-source datasets. Naturally, issues\nof trust arise on both the developer and user side: is the system working\ncorrectly, and why did a user receive (or not receive) a particular\nrecommendation? Providing an explanation alongside a recommendation alleviates\nsome of these concerns. The status quo for auxiliary recommender system\nfeedback is either user-specific explanations (e.g., \"users who bought item B\nalso bought item A\") or item-specific explanations (e.g., \"we are recommending\nitem A because you watched/bought item B\"). However, users bring personalized\ncontext into their search experience, valuing an item as a function of that\nitem's attributes and their own personal preferences. In this work, we propose\nRecXplainer, a novel method for generating fine-grained explanations based on a\nuser's preferences over the attributes of recommended items. 
We evaluate\nRecXplainer on five real-world and large-scale recommendation datasets using\nfive different kinds of recommender systems to demonstrate the efficacy of\nRecXplainer in capturing users' preferences over item attributes and using them\nto explain recommendations. We also compare RecXplainer to five baselines and\nshow RecXplainer's exceptional performance on ten metrics.\n","authors":["Sahil Verma","Chirag Shah","John P. Dickerson","Anurag Beniwal","Narayanan Sadagopan","Arjun Seshadri"],"pdf_url":"https://arxiv.org/pdf/2211.14935v2.pdf","comment":"Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022"},{"id":"http://arxiv.org/abs/2308.15602v1","updated":"2023-08-29T19:47:31Z","published":"2023-08-29T19:47:31Z","title":"An Experimental Comparison of Partitioning Strategies for Distributed\n Graph Neural Network Training","summary":" Recently, graph neural networks (GNNs) have gained much attention as a\ngrowing area of deep learning capable of learning on graph-structured data.\nHowever, the computational and memory requirements for training GNNs on\nlarge-scale graphs can exceed the capabilities of single machines or GPUs,\nmaking distributed GNN training a promising direction for large-scale GNN\ntraining. A prerequisite for distributed GNN training is to partition the input\ngraph into smaller parts that are distributed among multiple machines of a\ncompute cluster. Although graph partitioning has been extensively studied with\nregard to graph analytics and graph databases, its effect on GNN training\nperformance is largely unexplored.\n In this paper, we study the effectiveness of graph partitioning for\ndistributed GNN training. Our study aims to understand how different factors\nsuch as GNN parameters, mini-batch size, graph type, features size, and\nscale-out factor influence the effectiveness of graph partitioning. We conduct\nexperiments with two different GNN systems using vertex and edge partitioning.\nWe found that graph partitioning is a crucial pre-processing step that can\nheavily reduce the training time and memory footprint. Furthermore, our results\nshow that invested partitioning time can be amortized by reduced GNN training,\nmaking it a relevant optimization.\n","authors":["Nikolai Merkel","Daniel Stoll","Ruben Mayer","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2308.15602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15594v1","updated":"2023-08-29T19:38:41Z","published":"2023-08-29T19:38:41Z","title":"Can transformers learn the greatest common divisor?","summary":" I investigate the capability of small transformers to compute the greatest\ncommon divisor (GCD) of two positive integers. When the training distribution\nand the representation base are carefully chosen, models achieve 98% accuracy\nand correctly predict 91 of the 100 first GCD. Model predictions are\ndeterministic and fully interpretable. During training, the models learn to\ncluster input pairs with the same GCD, and classify them by their divisors.\nBasic models, trained from uniform operands encoded on small bases, only\ncompute a handful of GCD (up to 38 out of 100): the products of divisors of the\nbase. Longer training and larger bases allow some models to \"grok\" small prime\nGCD. Training from log-uniform operands boosts performance to 73 correct GCD,\nand balancing the training distribution of GCD, from inverse square to\nlog-uniform, to 91 GCD. 
Training models from a uniform distribution of GCD\nbreaks the deterministic model behavior.\n","authors":["François Charton"],"pdf_url":"https://arxiv.org/pdf/2308.15594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.12871v3","updated":"2023-08-29T19:21:50Z","published":"2022-09-26T17:39:53Z","title":"Variationally Mimetic Operator Networks","summary":" In recent years operator networks have emerged as promising deep learning\ntools for approximating the solution to partial differential equations (PDEs).\nThese networks map input functions that describe material properties, forcing\nfunctions and boundary data to the solution of a PDE. This work describes a new\narchitecture for operator networks that mimics the form of the numerical\nsolution obtained from an approximate variational or weak formulation of the\nproblem. The application of these ideas to a generic elliptic PDE leads to a\nvariationally mimetic operator network (VarMiON). Like the conventional Deep\nOperator Network (DeepONet) the VarMiON is also composed of a sub-network that\nconstructs the basis functions for the output and another that constructs the\ncoefficients for these basis functions. However, in contrast to the DeepONet,\nthe architecture of these sub-networks in the VarMiON is precisely determined.\nAn analysis of the error in the VarMiON solution reveals that it contains\ncontributions from the error in the training data, the training error, the\nquadrature error in sampling input and output functions, and a \"covering error\"\nthat measures the distance between the test input functions and the nearest\nfunctions in the training dataset. It also depends on the stability constants\nfor the exact solution operator and its VarMiON approximation. The application\nof the VarMiON to a canonical elliptic PDE and a nonlinear PDE reveals that for\napproximately the same number of network parameters, on average the VarMiON\nincurs smaller errors than a standard DeepONet and a recently proposed\nmultiple-input operator network (MIONet). Further, its performance is more\nrobust to variations in input functions, the techniques used to sample the\ninput and output functions, the techniques used to construct the basis\nfunctions, and the number of input functions.\n","authors":["Dhruv Patel","Deep Ray","Michael R. A. Abdelmalik","Thomas J. R. Hughes","Assad A. Oberai"],"pdf_url":"https://arxiv.org/pdf/2209.12871v3.pdf","comment":"49 pages, 18 figures, 1 Appendix"},{"id":"http://arxiv.org/abs/2308.15575v1","updated":"2023-08-29T19:04:42Z","published":"2023-08-29T19:04:42Z","title":"Prototype Fission: Closing Set for Robust Open-set Semi-supervised\n Learning","summary":" Semi-supervised Learning (SSL) has been proven vulnerable to\nout-of-distribution (OOD) samples in realistic large-scale unsupervised\ndatasets due to over-confident pseudo-labeling OODs as in-distribution (ID). A\nkey underlying problem is class-wise latent space spreading from closed seen\nspace to open unseen space, and the bias is further magnified in SSL's\nself-training loops. To close the ID distribution set so that OODs are better\nrejected for safe SSL, we propose Prototype Fission(PF) to divide class-wise\nlatent spaces into compact sub-spaces by automatic fine-grained latent space\nmining, driven by coarse-grained labels only. Specifically, we form multiple\nunique learnable sub-class prototypes for each class, optimized towards both\ndiversity and consistency. 
The Diversity Modeling term encourages samples to be\nclustered by one of the multiple sub-class prototypes, while the Consistency\nModeling term clusters all samples of the same class to a global prototype.\nInstead of \"opening set\", i.e., modeling OOD distribution, Prototype Fission\n\"closes set\" and makes it hard for OOD samples to fit in sub-class latent\nspace. Therefore, PF is compatible with existing methods for further\nperformance gains. Extensive experiments validate the effectiveness of our\nmethod in open-set SSL settings in terms of successfully forming sub-classes,\ndiscriminating OODs from IDs and improving overall accuracy. Codes will be\nreleased.\n","authors":["Xuwei Tan","Yi-Jie Huang","Yaqian Li"],"pdf_url":"https://arxiv.org/pdf/2308.15575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15564v1","updated":"2023-08-29T18:36:21Z","published":"2023-08-29T18:36:21Z","title":"Learning Sequential Information in Task-based fMRI for Synthetic Data\n Augmentation","summary":" Insufficiency of training data is a persistent issue in medical image\nanalysis, especially for task-based functional magnetic resonance images (fMRI)\nwith spatio-temporal imaging data acquired using specific cognitive tasks. In\nthis paper, we propose an approach for generating synthetic fMRI sequences that\ncan then be used to create augmented training datasets in downstream learning\ntasks. To synthesize high-resolution task-specific fMRI, we adapt the\n$\\alpha$-GAN structure, leveraging advantages of both GAN and variational\nautoencoder models, and propose different alternatives in aggregating temporal\ninformation. The synthetic images are evaluated from multiple perspectives\nincluding visualizations and an autism spectrum disorder (ASD) classification\ntask. The results show that the synthetic task-based fMRI can provide effective\ndata augmentation in learning the ASD classification task.\n","authors":["Jiyao Wang","Nicha C. Dvornek","Lawrence H. Staib","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2308.15564v1.pdf","comment":"Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI\n workshop), preprint version"},{"id":"http://arxiv.org/abs/2308.15559v1","updated":"2023-08-29T18:29:56Z","published":"2023-08-29T18:29:56Z","title":"Glocal Explanations of Expected Goal Models in Soccer","summary":" The expected goal models have gained popularity, but their interpretability\nis often limited, especially when trained using black-box methods. Explainable\nartificial intelligence tools have emerged to enhance model transparency and\nextract descriptive knowledge for a single observation or for all observations.\nHowever, explaining black-box models for a specific group of observations may\nbe more useful in some domains. This paper introduces the glocal explanations\n(between local and global levels) of the expected goal models to enable\nperformance analysis at the team and player levels by proposing the use of\naggregated versions of the SHAP values and partial dependence profiles. This\nallows knowledge to be extracted from the expected goal model for a player or\nteam rather than just a single shot. In addition, we conducted real-data\napplications to illustrate the usefulness of aggregated SHAP and aggregated\nprofiles. 
The paper concludes with remarks on the potential of these\nexplanations for performance analysis in soccer analytics.\n","authors":["Mustafa Cavus","Adrian Stando","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.15559v1.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.13399v2","updated":"2023-08-29T18:28:13Z","published":"2023-08-25T14:23:40Z","title":"EntropyRank: Unsupervised Keyphrase Extraction via Side-Information\n Optimization for Language Model-based Text Compression","summary":" We propose an unsupervised method to extract keywords and keyphrases from\ntexts based on a pre-trained language model (LM) and Shannon's information\nmaximization. Specifically, our method extracts phrases having the highest\nconditional entropy under the LM. The resulting set of keyphrases turns out to\nsolve a relevant information-theoretic problem: if provided as side\ninformation, it leads to the expected minimal binary code length in compressing\nthe text using the LM and an entropy encoder. Alternately, the resulting set is\nan approximation via a causal LM to the set of phrases that minimize the\nentropy of the text when conditioned upon it. Empirically, the method provides\nresults comparable to the most commonly used methods in various keyphrase\nextraction benchmark challenges.\n","authors":["Alexander Tsvetkov","Alon Kipnis"],"pdf_url":"https://arxiv.org/pdf/2308.13399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07522v3","updated":"2023-08-29T18:24:44Z","published":"2023-07-09T21:16:56Z","title":"The Future of Fundamental Science Led by Generative Closed-Loop\n Artificial Intelligence","summary":" Recent advances in machine learning and AI, including Generative AI and LLMs,\nare disrupting technological innovation, product development, and society as a\nwhole. AI's contribution to technology can come from multiple approaches that\nrequire access to large training data sets and clear performance evaluation\ncriteria, ranging from pattern recognition and classification to generative\nmodels. Yet, AI has contributed less to fundamental science in part because\nlarge data sets of high-quality data for scientific practice and model\ndiscovery are more difficult to access. Generative AI, in general, and Large\nLanguage Models in particular, may represent an opportunity to augment and\naccelerate the scientific discovery of fundamental deep science with\nquantitative models. Here we explore and investigate aspects of an AI-driven,\nautomated, closed-loop approach to scientific discovery, including self-driven\nhypothesis generation and open-ended autonomous exploration of the hypothesis\nspace. Integrating AI-driven automation into the practice of science would\nmitigate current problems, including the replication of findings, systematic\nproduction of data, and ultimately democratisation of the scientific process.\nRealising these possibilities requires a vision for augmented AI coupled with a\ndiversity of AI approaches able to deal with fundamental aspects of causality\nanalysis and model discovery while enabling unbiased search across the space of\nputative explanations. These advances hold the promise to unleash AI's\npotential for searching and discovering the fundamental structure of our world\nbeyond what human scientists have been able to achieve. 
Such a vision would\npush the boundaries of new fundamental science rather than automatize current\nworkflows and instead open doors for technological innovation to tackle some of\nthe greatest challenges facing humanity today.\n","authors":["Hector Zenil","Jesper Tegnér","Felipe S. Abrahão","Alexander Lavin","Vipin Kumar","Jeremy G. Frey","Adrian Weller","Larisa Soldatova","Alan R. Bundy","Nicholas R. Jennings","Koichi Takahashi","Lawrence Hunter","Saso Dzeroski","Andrew Briggs","Frederick D. Gregory","Carla P. Gomes","Jon Rowe","James Evans","Hiroaki Kitano","Ross King"],"pdf_url":"https://arxiv.org/pdf/2307.07522v3.pdf","comment":"35 pages, first draft of the final report from the Alan Turing\n Institute on AI for Scientific Discovery"},{"id":"http://arxiv.org/abs/2308.15553v1","updated":"2023-08-29T18:19:36Z","published":"2023-08-29T18:19:36Z","title":"Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster\n Analysis","summary":" We introduce usage of a reduction property of penalty-based formulation of\npseudo-Boolean polynomials as a mechanism for invariant dimensionality\nreduction in cluster analysis processes. In our experiments, we show that\nmultidimensional data, like 4-dimensional Iris Flower dataset can be reduced to\n2-dimensional space while the 30-dimensional Wisconsin Diagnostic Breast Cancer\n(WDBC) dataset can be reduced to 3-dimensional space, and by searching lines or\nplanes that lie between reduced samples we can extract clusters in a linear and\nunbiased manner with competitive accuracies, reproducibility and clear\ninterpretation.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin"],"pdf_url":"https://arxiv.org/pdf/2308.15553v1.pdf","comment":"14 pages, 4 figures, submitted to the International Conference Data\n Analysis, Optimization and Their Applications on the Occasion of Boris\n Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region,\n Moscow Institute of Physics and Technology\n https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php"},{"id":"http://arxiv.org/abs/2308.15552v1","updated":"2023-08-29T18:18:21Z","published":"2023-08-29T18:18:21Z","title":"Pure Exploration under Mediators' Feedback","summary":" Stochastic multi-armed bandits are a sequential-decision-making framework,\nwhere, at each interaction step, the learner selects an arm and observes a\nstochastic reward. Within the context of best-arm identification (BAI)\nproblems, the goal of the agent lies in finding the optimal arm, i.e., the one\nwith highest expected reward, as accurately and efficiently as possible.\nNevertheless, the sequential interaction protocol of classical BAI problems,\nwhere the agent has complete control over the arm being pulled at each round,\ndoes not effectively model several decision-making problems of interest (e.g.,\noff-policy learning, partially controllable environments, and human feedback).\nFor this reason, in this work, we propose a novel strict generalization of the\nclassical BAI problem that we refer to as best-arm identification under\nmediators' feedback (BAI-MF). More specifically, we consider the scenario in\nwhich the learner has access to a set of mediators, each of which selects the\narms on the agent's behalf according to a stochastic and possibly unknown\npolicy. The mediator, then, communicates back to the agent the pulled arm\ntogether with the observed reward. 
In this setting, the agent's goal lies in\nsequentially choosing which mediator to query to identify with high probability\nthe optimal arm while minimizing the identification time, i.e., the sample\ncomplexity. To this end, we first derive and analyze a statistical lower bound\non the sample complexity specific to our general mediator feedback scenario.\nThen, we propose a sequential decision-making strategy for discovering the best\narm under the assumption that the mediators' policies are known to the learner.\nAs our theory verifies, this algorithm matches the lower bound both almost\nsurely and in expectation. Finally, we extend these results to cases where the\nmediators' policies are unknown to the learner obtaining comparable results.\n","authors":["Riccardo Poiani","Alberto Maria Metelli","Marcello Restelli"],"pdf_url":"https://arxiv.org/pdf/2308.15552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15550v1","updated":"2023-08-29T18:17:35Z","published":"2023-08-29T18:17:35Z","title":"Adversarial Style Transfer for Robust Policy Optimization in Deep\n Reinforcement Learning","summary":" This paper proposes an algorithm that aims to improve generalization for\nreinforcement learning agents by removing overfitting to confounding features.\nOur approach consists of a max-min game theoretic objective. A generator\ntransfers the style of observation during reinforcement learning. An additional\ngoal of the generator is to perturb the observation, which maximizes the\nagent's probability of taking a different action. In contrast, a policy network\nupdates its parameters to minimize the effect of such perturbations, thus\nstaying robust while maximizing the expected future reward. Based on this\nsetup, we propose a practical deep reinforcement learning algorithm,\nAdversarial Robust Policy Optimization (ARPO), to find a robust policy that\ngeneralizes to unseen environments. We evaluate our approach on Procgen and\nDistracting Control Suite for generalization and sample efficiency.\nEmpirically, ARPO shows improved performance compared to a few baseline\nalgorithms, including data augmentation.\n","authors":["Md Masudur Rahman","Yexiang Xue"],"pdf_url":"https://arxiv.org/pdf/2308.15550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15513v1","updated":"2023-08-29T16:24:11Z","published":"2023-08-29T16:24:11Z","title":"Tuning the perplexity for and computing sampling-based t-SNE embeddings","summary":" Widely used pipelines for the analysis of high-dimensional data utilize\ntwo-dimensional visualizations. These are created, e.g., via t-distributed\nstochastic neighbor embedding (t-SNE). When it comes to large data sets,\napplying these visualization techniques creates suboptimal embeddings, as the\nhyperparameters are not suitable for large data. Cranking up these parameters\nusually does not work as the computations become too expensive for practical\nworkflows. In this paper, we argue that a sampling-based embedding approach can\ncircumvent these problems. 
We show that hyperparameters must be chosen\ncarefully, depending on the sampling rate and the intended final embedding.\nFurther, we show how this approach speeds up the computation and increases the\nquality of the embeddings.\n","authors":["Martin Skrodzki","Nicolas Chaves-de-Plaza","Klaus Hildebrandt","Thomas Höllt","Elmar Eisemann"],"pdf_url":"https://arxiv.org/pdf/2308.15513v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2305.02422v3","updated":"2023-08-29T22:12:04Z","published":"2023-05-03T20:29:04Z","title":"GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content","summary":" The mobile cloud gaming industry has been rapidly growing over the last\ndecade. When streaming gaming videos are transmitted to customers' client\ndevices from cloud servers, algorithms that can monitor distorted video quality\nwithout having any reference video available are desirable tools. However,\ncreating No-Reference Video Quality Assessment (NR VQA) models that can\naccurately predict the quality of streaming gaming videos rendered by computer\ngraphics engines is a challenging problem, since gaming content generally\ndiffers statistically from naturalistic videos, often lacks detail, and\ncontains many smooth regions. Until recently, the problem has been further\ncomplicated by the lack of adequate subjective quality databases of mobile\ngaming content. We have created a new gaming-specific NR VQA model called the\nGaming Video Quality Evaluator (GAMIVAL), which combines and leverages the\nadvantages of spatial and temporal gaming distorted scene statistics models, a\nneural noise model, and deep semantic features. Using a support vector\nregression (SVR) as a regressor, GAMIVAL achieves superior performance on the\nnew LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database.\n","authors":["Yu-Chih Chen","Avinab Saha","Chase Davis","Bo Qiu","Xiaoming Wang","Rahul Gowda","Ioannis Katsavounidis","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2305.02422v3.pdf","comment":"Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been\n made available online: https://github.com/lskdream/GAMIVAL"},{"id":"http://arxiv.org/abs/2308.15502v1","updated":"2023-08-29T10:41:34Z","published":"2023-08-29T10:41:34Z","title":"On the Steganographic Capacity of Selected Learning Models","summary":" Machine learning and deep learning models are potential vectors for various\nattack scenarios. For example, previous research has shown that malware can be\nhidden in deep learning models. Hiding information in a learning model can be\nviewed as a form of steganography. In this research, we consider the general\nquestion of the steganographic capacity of learning models. Specifically, for a\nwide range of models, we determine the number of low-order bits of the trained\nparameters that can be overwritten, without adversely affecting model\nperformance. For each model considered, we graph the accuracy as a function of\nthe number of low-order bits that have been overwritten, and for selected\nmodels, we also analyze the steganographic capacity of individual layers. 
The\nmodels that we test include the classic machine learning techniques of Linear\nRegression (LR) and Support Vector Machine (SVM); the popular general deep\nlearning models of Multilayer Perceptron (MLP) and Convolutional Neural Network\n(CNN); the highly-successful Recurrent Neural Network (RNN) architecture of\nLong Short-Term Memory (LSTM); the pre-trained transfer learning-based models\nVGG16, DenseNet121, InceptionV3, and Xception; and, finally, an Auxiliary\nClassifier Generative Adversarial Network (ACGAN). In all cases, we find that a\nmajority of the bits of each trained parameter can be overwritten before the\naccuracy degrades. Of the models tested, the steganographic capacity ranges\nfrom 7.04 KB for our LR experiments, to 44.74 MB for InceptionV3. We discuss\nthe implications of our results and consider possible avenues for further\nresearch.\n","authors":["Rishit Agrawal","Kelvin Jou","Tanush Obili","Daksh Parikh","Samarth Prajapati","Yash Seth","Charan Sridhar","Nathan Zhang","Mark Stamp"],"pdf_url":"https://arxiv.org/pdf/2308.15502v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2306.17189"}]},"2023-08-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.06032v2","updated":"2023-08-30T17:57:52Z","published":"2023-08-11T09:23:11Z","title":"Large Language Models in Cryptocurrency Securities Cases: Can ChatGPT\n Replace Lawyers?","summary":" Large Language Models (LLMs) could enhance access to the legal system.\nHowever, empirical research on their effectiveness in conducting legal tasks is\nscant. We study securities cases involving cryptocurrencies as one of numerous\ncontexts where AI could support the legal process, studying LLMs' legal\nreasoning and drafting capabilities. We examine whether a) an LLM can\naccurately determine which laws are potentially being violated from a fact\npattern, and b) whether there is a difference in juror decision-making based on\ncomplaints written by a lawyer compared to an LLM. We feed fact patterns from\nreal-life cases to GPT-3.5 and evaluate its ability to determine correct\npotential violations from the scenario and exclude spurious violations. Second,\nwe had mock jurors assess complaints written by the LLM and lawyers. GPT-3.5's\nlegal reasoning skills proved weak, though we expect improvement in future\nmodels, particularly given the violations it suggested tended to be correct (it\nmerely missed additional, correct violations). GPT-3.5 performed better at\nlegal drafting, and jurors' decisions were not statistically significantly\nassociated with the author of the document upon which they based their\ndecisions. Because LLMs cannot satisfactorily conduct legal reasoning tasks,\nthey would be unable to replace lawyers at this stage. However, their drafting\nskills (though, perhaps, still inferior to lawyers), could provide access to\njustice for more individuals by reducing the cost of legal services. 
Our\nresearch is the first to systematically study LLMs' legal drafting and\nreasoning capabilities in litigation, as well as in securities law and\ncryptocurrency-related misconduct.\n","authors":["Arianna Trozze","Toby Davies","Bennett Kleinberg"],"pdf_url":"https://arxiv.org/pdf/2308.06032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16175v1","updated":"2023-08-30T17:53:25Z","published":"2023-08-30T17:53:25Z","title":"Quantifying Uncertainty in Answers from any Language Model via Intrinsic\n and Extrinsic Confidence Assessment","summary":" We introduce BSDetector, a method for detecting bad and speculative answers\nfrom a pretrained Large Language Model by estimating a numeric confidence score\nfor any output it generates. Our uncertainty quantification technique works for\nany LLM accessible only via a black-box API, and combines intrinsic and\nextrinsic assessments of confidence into a single trustworthiness estimate for\nany LLM response to a given prompt. Our method is extremely general and can be\napplied to all of the best LLMs available today (whose training data remains\nunknown). By expending a bit of extra computation, users of any LLM API can now\nget the same response as they would ordinarily, as well as a confidence\nestimate that indicates when not to trust this response. Experiments on both\nclosed and open-form Question-Answer benchmarks reveal that BSDetector more\naccurately identifies incorrect LLM responses than alternative uncertainty\nestimation procedures (for both GPT-3 and ChatGPT). By sampling multiple\nresponses from the LLM and considering the one with the highest confidence\nscore, we can additionally obtain more accurate responses from the same LLM,\nwithout any extra training steps.\n","authors":["Jiuhai Chen","Jonas Mueller"],"pdf_url":"https://arxiv.org/pdf/2308.16175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17590v2","updated":"2023-08-30T17:46:17Z","published":"2023-03-30T17:57:43Z","title":"Going Beyond Nouns With Vision & Language Models Using Synthetic Data","summary":" Large-scale pre-trained Vision & Language (VL) models have shown remarkable\nperformance in many applications, enabling replacing a fixed set of supported\nclasses with zero-shot open vocabulary reasoning over (almost arbitrary)\nnatural language prompts. However, recent works have uncovered a fundamental\nweakness of these models. For example, they have difficulty understanding Visual\nLanguage Concepts (VLC) that go 'beyond nouns', such as the meaning of\nnon-object words (e.g., attributes, actions, relations, states, etc.), and\ndifficulty performing compositional reasoning, such as understanding the\nsignificance of the order of the words in a sentence. In this work, we\ninvestigate to which extent purely synthetic data could be leveraged to teach\nthese models to overcome such shortcomings without compromising their zero-shot\ncapabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale\nsynthetic dataset and data generation codebase that allows generating additional\nsuitable data to improve VLC understanding and compositional reasoning of VL\nmodels. Additionally, we propose a general VL finetuning strategy for\neffectively leveraging SyViC towards achieving these improvements. Our\nextensive experiments and ablations on VL-Checklist, Winoground, and ARO\nbenchmarks demonstrate that it is possible to adapt strong pre-trained VL\nmodels with synthetic data significantly enhancing their VLC understanding\n(e.g. 
by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their\nzero-shot accuracy.\n","authors":["Paola Cascante-Bonilla","Khaled Shehada","James Seale Smith","Sivan Doveh","Donghyun Kim","Rameswar Panda","Gül Varol","Aude Oliva","Vicente Ordonez","Rogerio Feris","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2303.17590v2.pdf","comment":"Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/"},{"id":"http://arxiv.org/abs/2308.16149v1","updated":"2023-08-30T17:07:17Z","published":"2023-08-30T17:07:17Z","title":"Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open\n Generative Large Language Models","summary":" We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric\nfoundation and instruction-tuned open generative large language models (LLMs).\nThe models are based on the GPT-3 decoder-only architecture and are pretrained\non a mixture of Arabic and English texts, including source code in various\nprogramming languages. With 13 billion parameters, they demonstrate better\nknowledge and reasoning capabilities in Arabic than any existing open Arabic\nand multilingual models by a sizable margin, based on extensive evaluation.\nMoreover, the models are competitive in English compared to English-centric\nopen models of similar size, despite being trained on much less English data.\nWe provide a detailed description of the training, the tuning, the safety\nalignment, and the evaluation of the models. We release two open versions of\nthe model -- the foundation Jais model, and an instruction-tuned Jais-chat\nvariant -- with the aim of promoting research on Arabic LLMs. Available at\nhttps://huggingface.co/inception-mbzuai/jais-13b-chat\n","authors":["Neha Sengupta","Sunil Kumar Sahu","Bokang Jia","Satheesh Katipomu","Haonan Li","Fajri Koto","Osama Mohammed Afzal","Samta Kamboj","Onkar Pandit","Rahul Pal","Lalit Pradhan","Zain Muhammad Mujahid","Massa Baali","Alham Fikri Aji","Zhengzhong Liu","Andy Hock","Andrew Feldman","Jonathan Lee","Andrew Jackson","Preslav Nakov","Timothy Baldwin","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2308.16149v1.pdf","comment":"Arabic-centric, foundation model, large-language model, LLM,\n generative model, instruction-tuned, Jais, Jais-chat"},{"id":"http://arxiv.org/abs/2308.16137v1","updated":"2023-08-30T16:47:51Z","published":"2023-08-30T16:47:51Z","title":"LM-Infinite: Simple On-the-Fly Length Generalization for Large Language\n Models","summary":" In recent years, there have been remarkable advancements in the performance\nof Transformer-based Large Language Models (LLMs) across various domains. As\nthese LLMs are deployed for increasingly complex tasks, they often face the\nneed to conduct longer reasoning processes or understand larger contexts.\nIn these situations, the length generalization failure of LLMs on long\nsequences becomes more prominent. Most pre-training schemes truncate training\nsequences to a fixed length (such as 2048 for LLaMa). LLMs often struggle to\ngenerate fluent texts, let alone carry out downstream tasks, after longer\ncontexts, even with relative positional encoding which is designed to cope with\nthis problem. Common solutions such as finetuning on longer corpora often\ninvolve daunting hardware and time costs and require careful training process\ndesign. To more efficiently leverage the generation capacity of existing LLMs,\nwe theoretically and empirically investigate the main out-of-distribution (OOD)\nfactors contributing to this problem. 
Inspired by this diagnosis, we propose a\nsimple yet effective solution for on-the-fly length generalization,\nLM-Infinite, which involves only a $\Lambda$-shaped attention mask and a\ndistance limit while requiring no parameter updates or learning. We find it\napplicable to a variety of LLMs using relative-position encoding methods.\nLM-Infinite is computationally efficient with $O(n)$ time and space, and\ndemonstrates consistent fluency and generation quality on sequences as long as\n32k tokens on the ArXiv and OpenWebText2 datasets, with a 2.72x decoding\nspeedup. On downstream tasks such as passkey retrieval, it continues to work on\ninputs much longer than training lengths where vanilla models fail immediately.\n","authors":["Chi Han","Qifan Wang","Wenhan Xiong","Yu Chen","Heng Ji","Sinong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16137v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.17680v4","updated":"2023-08-30T16:17:27Z","published":"2023-05-28T10:05:13Z","title":"Evaluating GPT-3 Generated Explanations for Hateful Content Moderation","summary":" Recent research has focused on using large language models (LLMs) to generate\nexplanations for hate speech through fine-tuning or prompting. Despite the\ngrowing interest in this area, these generated explanations' effectiveness and\npotential limitations remain poorly understood. A key concern is that these\nexplanations, generated by LLMs, may lead to erroneous judgments about the\nnature of flagged content by both users and content moderators. For instance,\nan LLM-generated explanation might inaccurately convince a content moderator\nthat a benign piece of content is hateful. In light of this, we propose an\nanalytical framework for examining hate speech explanations and conducted an\nextensive survey on evaluating such explanations. Specifically, we prompted\nGPT-3 to generate explanations for both hateful and non-hateful content, and a\nsurvey was conducted with 2,400 unique respondents to evaluate the generated\nexplanations. Our findings reveal that (1) human evaluators rated the\nGPT-generated explanations as high quality in terms of linguistic fluency,\ninformativeness, persuasiveness, and logical soundness, (2) the persuasive\nnature of these explanations, however, varied depending on the prompting\nstrategy employed, and (3) this persuasiveness may result in incorrect\njudgments about the hatefulness of the content. Our study underscores the need\nfor caution in applying LLM-generated explanations for content moderation. Code\nand results are available at https://github.com/Social-AI-Studio/GPT3-HateEval.\n","authors":["Han Wang","Ming Shan Hee","Md Rabiul Awal","Kenny Tsu Wei Choo","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2305.17680v4.pdf","comment":"9 pages, 2 figures, Accepted by International Joint Conference on\n Artificial Intelligence(IJCAI)"},{"id":"http://arxiv.org/abs/2308.16118v1","updated":"2023-08-30T16:17:26Z","published":"2023-08-30T16:17:26Z","title":"Response: Emergent analogical reasoning in large language models","summary":" In their recent Nature Human Behaviour paper, \"Emergent analogical reasoning\nin large language models,\" (Webb, Holyoak, and Lu, 2023) the authors argue that\n\"large language models such as GPT-3 have acquired an emergent ability to find\nzero-shot solutions to a broad range of analogy problems.\" In this response, we\nprovide counterexamples to the letter string analogies. 
In our tests, GPT-3\nfails to solve even the easiest variants of the problems presented in the\noriginal paper. Zero-shot reasoning is an extraordinary claim that requires\nextraordinary evidence. We do not see that evidence in our experiments. To\nstrengthen claims of humanlike reasoning such as zero-shot reasoning, it is\nimportant that the field develop approaches that rule out data memorization.\n","authors":["Damian Hodel","Jevin West"],"pdf_url":"https://arxiv.org/pdf/2308.16118v1.pdf","comment":"Response to publication in Nature Human Behaviour titled \"Emergent\n analogical reasoning in large language models,\" (Webb, Holyoak, and Lu, 2023,\n arXiv:2212.09196). 9 pages"},{"id":"http://arxiv.org/abs/2308.14359v2","updated":"2023-08-30T16:08:28Z","published":"2023-08-28T07:11:27Z","title":"Effect of Attention and Self-Supervised Speech Embeddings on\n Non-Semantic Speech Tasks","summary":" Human emotion understanding is pivotal in making conversational technology\nmainstream. We view speech emotion understanding as a perception task which is\na more realistic setting. With varying contexts (languages, demographics,\netc.), different shares of people perceive the same speech segment as a\nnon-unanimous emotion. As part of the ACM Multimedia 2023 Computational\nParalinguistics ChallengE (ComParE) in the EMotion Share track, we leverage\ntheir rich dataset of multilingual speakers and multi-label regression target\nof 'emotion share' or perception of that emotion. We demonstrate that the\ntraining scheme of different foundation models dictates their effectiveness for\ntasks beyond speech recognition, especially for non-semantic speech tasks like\nemotion understanding. This is a very complex task due to multilingual\nspeakers, variability in the target labels, and inherent imbalance in the\nregression dataset. Our results show that HuBERT-Large with a\nself-attention-based light-weight sequence model provides 4.6% improvement over\nthe reported baseline.\n","authors":["Payal Mohapatra","Akash Pandey","Yueyuan Sui","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14359v2.pdf","comment":"Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges\n Track"},{"id":"http://arxiv.org/abs/2308.16109v1","updated":"2023-08-30T16:04:54Z","published":"2023-08-30T16:04:54Z","title":"Grandma Karl is 27 years old -- research agenda for pseudonymization of\n research data","summary":" Accessibility of research data is critical for advances in many research\nfields, but textual data often cannot be shared due to the personal and\nsensitive information which it contains, e.g. names or political opinions.\nThe General Data Protection Regulation (GDPR) suggests pseudonymization as a\nsolution to secure open access to research data, but we need to learn more\nabout pseudonymization as an approach before adopting it for manipulation of\nresearch data. This paper outlines a research agenda within pseudonymization,\nnamely the need for studies into the effects of pseudonymization on\nunstructured data in relation to e.g. readability and language assessment, as\nwell as the effectiveness of pseudonymization as a way of protecting writer\nidentity, while also exploring different ways of developing context-sensitive\nalgorithms for detection, labelling and replacement of personal information in\nunstructured data. 
The recently granted project on pseudonymization, \"Grandma Karl is 27 years\nold\", addresses exactly those challenges.\n","authors":["Elena Volodina","Simon Dobnik","Therese Lindström Tiedemann","Xuan-Son Vu"],"pdf_url":"https://arxiv.org/pdf/2308.16109v1.pdf","comment":"Big DataService 2023 conference, 2023 Workshop on Big Data and\n Machine Learning with Privacy Enhancing Tech, IEEE Catalog Number:\n CFP23A91-ART, ISBN: 979-8-3503-3379-4"},{"id":"http://arxiv.org/abs/2307.15745v2","updated":"2023-08-30T15:58:56Z","published":"2023-07-28T18:01:08Z","title":"Context-VQA: Towards Context-Aware and Purposeful Visual Question\n Answering","summary":" Visual question answering (VQA) has the potential to make the Internet more\naccessible in an interactive way, allowing people who cannot see images to ask\nquestions about them. However, multiple studies have shown that people who are\nblind or have low vision prefer image explanations that incorporate the context\nin which an image appears, yet current VQA datasets focus on images in\nisolation. We argue that VQA models will not fully succeed at meeting people's\nneeds unless they take context into account. To further motivate and analyze\nthe distinction between different contexts, we introduce Context-VQA, a VQA\ndataset that pairs images with contexts, specifically types of websites (e.g.,\na shopping website). We find that the types of questions vary systematically\nacross contexts. For example, images presented in a travel context garner 2\ntimes more \"Where?\" questions, and images on social media and news garner 2.8\nand 1.8 times more \"Who?\" questions than the average. We also find that context\neffects are especially important when participants can't see the image. These\nresults demonstrate that context affects the types of questions asked and that\nVQA models should be context-sensitive to better meet people's needs,\nespecially in accessibility settings.\n","authors":["Nandita Naik","Christopher Potts","Elisa Kreiss"],"pdf_url":"https://arxiv.org/pdf/2307.15745v2.pdf","comment":"Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision\n and Language"},{"id":"http://arxiv.org/abs/2305.09438v3","updated":"2023-08-30T14:56:16Z","published":"2023-05-16T13:50:24Z","title":"MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with\n Transformers","summary":" Message Passing Interface (MPI) plays a crucial role in distributed memory\nparallelization across multiple nodes. However, parallelizing MPI code\nmanually, and specifically, performing domain decomposition, is a challenging,\nerror-prone task. In this paper, we address this problem by developing\nMPI-RICAL, a novel data-driven, programming-assistance tool that assists\nprogrammers in writing domain decomposition based distributed memory\nparallelization code. Specifically, we train a supervised language model to\nsuggest MPI functions and their proper locations in the code on the fly. We\nalso introduce MPICodeCorpus, the first publicly available corpus of MPI-based\nparallel programs that is created by mining more than 15,000 open-source\nrepositories on GitHub. Experiments were conducted on MPICodeCorpus and, more\nimportantly, on a compiled benchmark of MPI-based parallel programs for\nnumerical computations that represent real-world scientific applications.\nMPI-RICAL achieves F1 scores between 0.87 and 0.91 on these programs,\ndemonstrating its accuracy in suggesting correct MPI functions at appropriate\ncode locations. 
The source code used in this work, as well as other relevant\nsources, is available at:\nhttps://github.com/Scientific-Computing-Lab-NRCN/MPI-rical\n","authors":["Nadav Schneider","Tal Kadosh","Niranjan Hasabnis","Timothy Mattson","Yuval Pinter","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2305.09438v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10390v3","updated":"2023-08-30T14:55:01Z","published":"2023-08-20T23:47:23Z","title":"LibriSQA: Advancing Free-form and Open-ended Spoken Question Answering\n with a Novel Dataset and Framework","summary":" While Large Language Models (LLMs) have demonstrated commendable performance\nacross a myriad of domains and tasks, existing LLMs still exhibit a palpable\ndeficit in handling multimodal functionalities, especially for the Spoken\nQuestion Answering (SQA) task which necessitates precise alignment and deep\ninteraction between speech and text features. To address the SQA challenge on\nLLMs, we initially curated the free-form and open-ended LibriSQA dataset from\nLibrispeech, comprising Part I with natural conversational formats and Part II\nencompassing multiple-choice questions followed by answers and analytical\nsegments. Both parts collectively include 107k SQA pairs that cover various\ntopics. Given the evident paucity of existing speech-text LLMs, we propose a\nlightweight, end-to-end framework to execute the SQA task on LibriSQA,\nachieving significant results. By reformulating ASR into the SQA format, we\nfurther substantiate our framework's capability in handling ASR tasks. Our\nempirical findings bolster the LLMs' aptitude for aligning and comprehending\nmultimodal information, paving the way for the development of universal\nmultimodal LLMs. The dataset and demo can be found at\nhttps://github.com/ZihanZhaoSJTU/LibriSQA.\n","authors":["Zihan Zhao","Yiyang Jiang","Heyang Liu","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10390v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16075v1","updated":"2023-08-30T14:52:14Z","published":"2023-08-30T14:52:14Z","title":"Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for\n English to Indian Languages","summary":" The study investigates the effectiveness of utilizing multimodal information\nin Neural Machine Translation (NMT). While prior research focused on using\nmultimodal data in low-resource scenarios, this study examines how image\nfeatures impact translation when added to a large-scale, pre-trained unimodal\nNMT system. Surprisingly, the study finds that images might be redundant in\nthis context. Additionally, the research introduces synthetic noise to assess\nwhether images help the model deal with textual noise. Multimodal models\nslightly outperform text-only models in noisy settings, even with random\nimages. The study's experiments cover translation from English to Hindi,\nBengali, and Malayalam, significantly outperforming state-of-the-art\nbenchmarks. Interestingly, the effect of visual context varies with source text\nnoise: no visual context works best for non-noisy translations, cropped image\nfeatures are optimal for low noise, and full image features work better in\nhigh-noise scenarios. This sheds light on the role of visual context,\nespecially in noisy settings, opening up a new research direction for Noisy\nNeural Machine Translation in multimodal setups. 
The research emphasizes the importance of\ncombining visual and textual information for improved translation in various\nenvironments.\n","authors":["Baban Gain","Dibyanayan Bandyopadhyay","Samrat Mukherjee","Chandranath Adak","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2308.16075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16061v1","updated":"2023-08-30T14:36:25Z","published":"2023-08-30T14:36:25Z","title":"Conti Inc.: Understanding the Internal Discussions of a large\n Ransomware-as-a-Service Operator with Machine Learning","summary":" Ransomware-as-a-service (RaaS) is increasing the scale and complexity of\nransomware attacks. Understanding the internal operations behind RaaS has been\na challenge due to the illegality of such activities. The recent chat leak of\nthe Conti RaaS operator, one of the most infamous ransomware operators on the\ninternational scene, offers a key opportunity to better understand the inner\nworkings of such organizations. This paper analyzes the main topic discussions\nin the Conti chat leak using machine learning techniques such as Natural\nLanguage Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as\nvisualization strategies. Five discussion topics are found: 1) Business, 2)\nTechnical, 3) Internal tasking/Management, 4) Malware, and 5) Customer\nService/Problem Solving. Moreover, the distribution of topics among Conti\nmembers shows that only 4% of individuals have specialized discussions while\nalmost all individuals (96%) are all-rounders, meaning that their discussions\nrevolve around the five topics. The results also indicate that a significant\nproportion of Conti discussions are non-tech related. This study thus\nhighlights that running such large RaaS operations requires a workforce skilled\nbeyond technical abilities, with individuals involved in various tasks, from\nmanagement to customer service or problem solving. The discussion topics also\nshow that the organization behind the Conti RaaS operator shares\nsimilarities with a large firm. We conclude that, although RaaS represents an\nexample of specialization in the cybercrime industry, only a few members are\nspecialized in one topic, while the rest run and coordinate the RaaS\noperation.\n","authors":["Estelle Ruellan","Masarah Paquet-Clouston","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.16061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16060v1","updated":"2023-08-30T14:33:25Z","published":"2023-08-30T14:33:25Z","title":"Text-to-OverpassQL: A Natural Language Interface for Complex Geodata\n Querying of OpenStreetMap","summary":" We present Text-to-OverpassQL, a task designed to facilitate a natural\nlanguage interface for querying geodata from OpenStreetMap (OSM). The Overpass\nQuery Language (OverpassQL) allows users to formulate complex database queries\nand is widely adopted in the OSM ecosystem. Generating Overpass queries from\nnatural language input serves multiple use-cases. It enables novice users to\nutilize OverpassQL without prior knowledge, assists experienced users with\ncrafting advanced queries, and enables tool-augmented large language models to\naccess information stored in the OSM database. In order to assess the\nperformance of current sequence generation models on this task, we propose\nOverpassNL, a dataset of 8,352 queries with corresponding natural language\ninputs. 
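The Conti study above relies on LDA topic modelling to surface discussion themes. A minimal sketch of that technique with scikit-learn follows; the toy corpus, the topic count, and the preprocessing are illustrative stand-ins, not the paper's actual setup:

```python
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus standing in for leaked chat messages (invented examples).
chats = [
    "send the invoice and negotiate the payment",
    "fix the loader build and test the malware",
    "assign this task to the new team member",
    "the customer cannot decrypt, open a support ticket",
]
counts = CountVectorizer(stop_words="english").fit_transform(chats)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
doc_topics = lda.transform(counts)  # per-message topic distribution
print(doc_topics.round(2))
```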
We further introduce task-specific evaluation metrics and ground the\nevaluation of the Text-to-OverpassQL task by executing the queries against the\nOSM database. We establish strong baselines by finetuning sequence-to-sequence\nmodels and adapting large language models with in-context examples. The\ndetailed evaluation reveals strengths and weaknesses of the considered learning\nstrategies, laying the foundations for further research into the\nText-to-OverpassQL task.\n","authors":["Michael Staniek","Raphael Schumann","Maike Züfle","Stefan Riezler"],"pdf_url":"https://arxiv.org/pdf/2308.16060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16055v1","updated":"2023-08-30T14:24:16Z","published":"2023-08-30T14:24:16Z","title":"AsyncET: Asynchronous Learning for Knowledge Graph Entity Typing with\n Auxiliary Relations","summary":" Knowledge graph entity typing (KGET) is a task to predict the missing entity\ntypes in knowledge graphs (KG). Previously, KG embedding (KGE) methods tried to\nsolve the KGET task by introducing an auxiliary relation, 'hasType', to model\nthe relationship between entities and their types. However, a single auxiliary\nrelation has limited expressiveness for diverse entity-type patterns. We\nimprove the expressiveness of KGE methods by introducing multiple auxiliary\nrelations in this work. Similar entity types are grouped to reduce the number\nof auxiliary relations and improve their capability to model entity-type\npatterns with different granularities. With the presence of multiple auxiliary\nrelations, we propose a method adopting an Asynchronous learning scheme for\nEntity Typing, named AsyncET, which updates the entity and type embeddings\nalternately to keep the learned entity embedding up-to-date and informative\nfor entity type prediction. Experiments are conducted on two commonly used KGET\ndatasets to show that the performance of KGE methods on the KGET task can be\nsubstantially improved by the proposed multiple auxiliary relations and\nasynchronous embedding learning. Furthermore, our method has a significant\nadvantage over state-of-the-art methods in model sizes and time complexity.\n","authors":["Yun-Cheng Wang","Xiou Ge","Bin Wang","C. -C. Jay Kuo"],"pdf_url":"https://arxiv.org/pdf/2308.16055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17926v2","updated":"2023-08-30T13:22:35Z","published":"2023-05-29T07:41:03Z","title":"Large Language Models are not Fair Evaluators","summary":" In this paper, we uncover a systematic bias in the evaluation paradigm of\nadopting large language models~(LLMs), e.g., GPT-4, as a referee to score and\ncompare the quality of responses generated by candidate models. We find that\nthe quality ranking of candidate responses can be easily hacked by simply\naltering their order of appearance in the context. This manipulation allows us\nto skew the evaluation result, making one model appear considerably superior to\nthe other, e.g., Vicuna-13B could beat ChatGPT on 66 of 80 tested queries\nwith ChatGPT as an evaluator. 
To address this issue, we propose a calibration\nframework with three simple yet effective strategies: 1) Multiple Evidence\nCalibration, which requires the evaluator model to generate multiple evaluation\nevidence before assigning ratings; 2) Balanced Position Calibration, which\naggregates results across various orders to determine the final score; 3)\nHuman-in-the-Loop Calibration, which introduces a balanced position diversity\nentropy to measure the difficulty of each example and seeks human assistance\nwhen needed. We also manually annotate the \"win/tie/lose\" outcomes of responses\nfrom ChatGPT and Vicuna-13B on the Vicuna Benchmark's question prompts, and\nextensive experiments demonstrate that our approach successfully mitigates\nevaluation bias, resulting in closer alignment with human judgments. We release\nour code and human annotation at \\url{https://github.com/i-Eval/FairEval} to\nfacilitate future research.\n","authors":["Peiyi Wang","Lei Li","Liang Chen","Zefan Cai","Dawei Zhu","Binghuai Lin","Yunbo Cao","Qi Liu","Tianyu Liu","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2305.17926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13081v2","updated":"2023-08-30T13:16:46Z","published":"2023-08-24T20:57:07Z","title":"Formal specification terminology for demographic agent-based models of\n fixed-step single-clocked simulations","summary":" This document presents adequate formal terminology for the mathematical\nspecification of a subset of Agent Based Models (ABMs) in the field of\nDemography. The simulation of the targeted ABMs follows a fixed-step\nsingle-clocked pattern. The proposed terminology further improves model\nunderstanding and can act as a stand-alone methodology for the specification\nand optionally the documentation of a significant set of (demographic) ABMs.\nNevertheless, it is imaginable that this terminology, possibly with further\nextensions, can be merged with the largely-informal widely-used model\ndocumentation and communication O.D.D. protocol [Grimm et al., 2020,\nAmouroux et al., 2010] to reduce many sources of ambiguity that hinder model\nreplications by other modelers. A published demographic model documentation, a\nlargely simplified version of the Lone Parent Model [Gostoli and Silverman,\n2020], is separately published in [Elsheikh, 2023b] as an illustration of the\nformal terminology. The model was implemented in the Julia language [Elsheikh,\n2023a] based on the Agents.jl julia package [Datseris et al., 2022].\n","authors":["Atiyah Elsheikh"],"pdf_url":"https://arxiv.org/pdf/2308.13081v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2307.16548"},{"id":"http://arxiv.org/abs/2308.15214v2","updated":"2023-08-30T13:13:19Z","published":"2023-08-29T11:08:40Z","title":"FurChat: An Embodied Conversational Agent using LLMs, Combining Open and\n Closed-Domain Dialogue with Facial Expressions","summary":" We demonstrate an embodied conversational agent that can function as a\nreceptionist and generate a mixture of open and closed-domain dialogue along\nwith facial expressions, by using a large language model (LLM) to develop an\nengaging conversation. We deployed the system onto a Furhat robot, which is\nhighly expressive and capable of using both verbal and nonverbal cues during\ninteraction. The system was designed specifically for the National Robotarium\nto interact with visitors through natural conversations, providing them with\ninformation about the facilities, research, news, upcoming events, etc. 
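The Balanced Position Calibration strategy from the fair-evaluation paper above can be summarized in a few lines: score each pair of responses under both presentation orders and average. The sketch below assumes a hypothetical `judge` callable standing in for an LLM evaluator; it is not the released FairEval implementation:

```python
def balanced_position_score(judge, query, resp_a, resp_b):
    """Average an LLM judge's scores over both presentation orders.

    `judge(query, first, second)` is a hypothetical callable returning
    (score_first, score_second); averaging across the two orders cancels
    the position bias described in the abstract above.
    """
    s_a1, s_b1 = judge(query, resp_a, resp_b)   # response A shown first
    s_b2, s_a2 = judge(query, resp_b, resp_a)   # response B shown first
    return (s_a1 + s_a2) / 2, (s_b1 + s_b2) / 2
```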
The\nsystem utilises the state-of-the-art GPT-3.5 model to generate such information\nalong with domain-general conversations and facial expressions based on prompt\nengineering.\n","authors":["Neeraj Cherakara","Finny Varghese","Sheena Shabana","Nivan Nelson","Abhiram Karukayil","Rohith Kulothungan","Mohammed Afil Farhan","Birthe Nesset","Meriam Moujahid","Tanvi Dinkar","Verena Rieser","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2308.15214v2.pdf","comment":"5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the\n Special Interest Group on Discourse and Dialogue), for the demo video, see\n https://youtu.be/fwtUl1kl22s"},{"id":"http://arxiv.org/abs/2211.02423v2","updated":"2023-08-30T12:30:33Z","published":"2022-11-04T12:56:12Z","title":"CLSE: Corpus of Linguistically Significant Entities","summary":" One of the biggest challenges of natural language generation (NLG) is the\nproper handling of named entities. Named entities are a common source of\ngrammar mistakes such as wrong prepositions, wrong article handling, or\nincorrect entity inflection. Without factoring linguistic representation, such\nerrors are often underrepresented when evaluating on a small set of arbitrarily\npicked argument values, or when translating a dataset from a linguistically\nsimpler language, like English, to a linguistically complex language, like\nRussian. However, for some applications, broadly precise grammatical\ncorrectness is critical -- native speakers may find entity-related grammar\nerrors silly, jarring, or even offensive.\n To enable the creation of more linguistically diverse NLG datasets, we\nrelease a Corpus of Linguistically Significant Entities (CLSE) annotated by\nlinguist experts. The corpus includes 34 languages and covers 74 different\nsemantic types to support various applications from airline ticketing to video\ngames. To demonstrate one possible use of CLSE, we produce an augmented version\nof the Schema-Guided Dialog Dataset, SGD-CLSE. Using the CLSE's entities and a\nsmall number of human translations, we create a linguistically representative\nNLG evaluation benchmark in three languages: French (high-resource), Marathi\n(low-resource), and Russian (highly inflected language). We establish quality\nbaselines for neural, template-based, and hybrid NLG systems and discuss the\nstrengths and weaknesses of each approach.\n","authors":["Aleksandr Chuklin","Justin Zhao","Mihir Kale"],"pdf_url":"https://arxiv.org/pdf/2211.02423v2.pdf","comment":"Proceedings of the 2nd Workshop on Natural Language Generation,\n Evaluation, and Metrics (GEM 2022) at EMNLP 2022"},{"id":"http://arxiv.org/abs/2308.15987v1","updated":"2023-08-30T12:18:18Z","published":"2023-08-30T12:18:18Z","title":"FPTQ: Fine-grained Post-Training Quantization for Large Language Models","summary":" In the era of large-scale language models, the substantial parameter size\nposes significant challenges for deployment. Being a prevalent compression\ntechnique, quantization has emerged as the mainstream practice to tackle this\nissue, which is mainly centered on two recipes W8A8 and W4A16 (i.e. weights and\nactivations in such bit widths). In this study, we propose a novel W4A8\npost-training quantization method for the available open-sourced LLMs, which\ncombines the advantages of both two recipes. Therefore, we can leverage the\nbenefit in the I/O utilization of 4-bit weight quantization and the\nacceleration due to 8-bit matrix computation. Nevertheless, the W4A8 faces\nnotorious performance degradation. 
As a remedy, we involve layerwise activation\nquantization strategies which feature a novel logarithmic equalization for the\nmost intractable layers, and we combine them with fine-grained weight\nquantization. Without bells and whistles, we eliminate the necessity for\nfurther fine-tuning and obtain the state-of-the-art W4A8 quantized performance\non BLOOM, LLaMA, and LLaMA-2 on standard benchmarks. We confirm that the W4A8\nquantization is achievable for the deployment of large language models,\nfostering their widespread real-world application.\n","authors":["Qingyuan Li","Yifan Zhang","Liang Li","Peng Yao","Bo Zhang","Xiangxiang Chu","Yerui Sun","Li Du","Yuchen Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15982v1","updated":"2023-08-30T12:10:17Z","published":"2023-08-30T12:10:17Z","title":"MerA: Merging Pretrained Adapters For Few-Shot Learning","summary":" Adapter tuning, which updates only a few parameters, has become a mainstream\nmethod for fine-tuning pretrained language models to downstream tasks. However,\nit often yields subpar results in few-shot learning. AdapterFusion, which\nassembles pretrained adapters using composition layers tailored to specific\ntasks, is a possible solution but significantly increases trainable parameters\nand deployment costs. Despite this, our preliminary study reveals that even\nsingle adapters can outperform AdapterFusion in few-shot learning, urging us to\npropose \\textbf{\\texttt{Merging Pretrained Adapters}} (MerA) that efficiently\nincorporates pretrained adapters into a single model through model fusion.\nExtensive experiments on two PLMs demonstrate that MerA achieves substantial\nimprovements compared to both single adapters and AdapterFusion. To further\nenhance the capacity of MerA, we also introduce a simple yet effective\ntechnique, referred to as the \"\\textit{same-track}\" setting, that merges\nadapters from the same track of pretraining tasks. With the implementation of\nthe \"\\textit{same-track}\" setting, we observe even more impressive gains,\nsurpassing the performance of both full fine-tuning and adapter tuning by a\nsubstantial margin, e.g., 3.5\\% in MRPC and 5.0\\% in MNLI.\n","authors":["Shwai He","Run-Ze Fan","Liang Ding","Li Shen","Tianyi Zhou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2308.15982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15961v1","updated":"2023-08-30T11:35:21Z","published":"2023-08-30T11:35:21Z","title":"Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting","summary":" The task of radiology reporting comprises describing and interpreting the\nmedical findings in radiographic images, including description of their\nlocation and appearance. Automated approaches to radiology reporting require\nthe image to be encoded into a suitable token representation for input to the\nlanguage model. Previous methods commonly use convolutional neural networks to\nencode an image into a series of image-level feature map representations.\nHowever, the generated reports often exhibit realistic style but imperfect\naccuracy. Inspired by recent works for image captioning in the general domain\nin which each visual token corresponds to an object detected in an image, we\ninvestigate whether using local tokens corresponding to anatomical structures\ncan improve the quality of the generated reports. 
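The adapter merging described in the MerA entry above amounts, in its simplest form, to fusing several adapter checkpoints into one parameter set. The sketch below uses plain parameter-wise weighted averaging as a stand-in; MerA's actual fusion procedure may differ:

```python
import torch

def merge_adapters(state_dicts, weights=None):
    """Parameter-wise weighted average of adapter checkpoints.

    A minimal stand-in for 'merging pretrained adapters'; assumes all
    checkpoints share the same keys and tensor shapes.
    """
    n = len(state_dicts)
    weights = weights or [1.0 / n] * n
    merged = {}
    for key in state_dicts[0]:
        merged[key] = sum(w * sd[key] for w, sd in zip(weights, state_dicts))
    return merged
```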
We introduce a novel\nadaptation of Faster R-CNN in which finding detection is performed for the\ncandidate bounding boxes extracted during anatomical structure localisation. We\nuse the resulting bounding box feature representations as our set of\nfinding-aware anatomical tokens. This encourages the extracted anatomical\ntokens to be informative about the findings they contain (required for the\nfinal task of radiology reporting). Evaluating on the MIMIC-CXR dataset of\nchest X-Ray images, we show that task-aware anatomical tokens give\nstate-of-the-art performance when integrated into an automated reporting\npipeline, yielding generated reports with improved clinical accuracy.\n","authors":["Francesco Dalla Serra","Chaoyang Wang","Fani Deligianni","Jeffrey Dalton","Alison Q. O'Neil"],"pdf_url":"https://arxiv.org/pdf/2308.15961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15952v1","updated":"2023-08-30T11:02:26Z","published":"2023-08-30T11:02:26Z","title":"Benchmarking Multilabel Topic Classification in the Kyrgyz Language","summary":" Kyrgyz is a very underrepresented language in terms of modern natural\nlanguage processing resources. In this work, we present a new public benchmark\nfor topic classification in Kyrgyz, introducing a dataset based on collected\nand annotated data from the news site 24.KG and presenting several baseline\nmodels for news classification in the multilabel setting. We train and evaluate\nboth classical statistical and neural models, reporting the scores, discussing\nthe results, and proposing directions for future work.\n","authors":["Anton Alekseev","Sergey I. Nikolenko","Gulnara Kabaeva"],"pdf_url":"https://arxiv.org/pdf/2308.15952v1.pdf","comment":"Accepted to AIST 2023"},{"id":"http://arxiv.org/abs/2308.09662v3","updated":"2023-08-30T10:21:00Z","published":"2023-08-18T16:27:04Z","title":"Red-Teaming Large Language Models using Chain of Utterances for\n Safety-Alignment","summary":" Large language models (LLMs) have taken the world by storm with their\nmassive multi-tasking capabilities simply by optimizing over a next-word\nprediction objective. With the emergence of their properties and encoded\nknowledge, the risk of LLMs producing harmful outputs increases, making them\nunfit for scalable deployment for the public. In this work, we propose a new\nsafety evaluation benchmark RED-EVAL that carries out red-teaming. We show that\neven widely deployed models are susceptible to the Chain of Utterances-based\n(CoU) prompting, jailbreaking closed source LLM-based systems such as GPT-4 and\nChatGPT to unethically respond to more than 65% and 73% of harmful queries. We\nalso demonstrate the consistency of the RED-EVAL across 8 open-source LLMs in\ngenerating harmful responses in more than 86% of the red-teaming attempts.\nNext, we propose RED-INSTRUCT, an approach for the safety alignment of LLMs. It\nconstitutes two phases: 1) HARMFULQA data collection: Leveraging CoU prompting,\nwe collect a dataset that consists of 1.9K harmful questions covering a wide\nrange of topics, 9.5K safe and 7.3K harmful conversations from ChatGPT; 2)\nSAFE-ALIGN: We demonstrate how the conversational dataset can be used for the\nsafety alignment of LLMs by minimizing the negative log-likelihood over helpful\nresponses and penalizing harmful responses via gradient ascent on the sample\nloss. 
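The SAFE-ALIGN objective described above (descend on the loss of helpful responses, ascend on the loss of harmful ones) can be sketched as a single combined loss. The tensor shapes, the penalty weight, and the implementation of gradient ascent as a subtracted term are assumptions for illustration, not the paper's exact formulation:

```python
import torch
import torch.nn.functional as F

def safe_align_loss(helpful_logits, helpful_ids,
                    harmful_logits, harmful_ids, penalty=0.1):
    """Combined alignment loss: minimize NLL on helpful responses and
    perform gradient ascent on harmful ones by subtracting their NLL.
    Logits are (batch, seq, vocab); token ids are (batch, seq).
    `penalty` is an assumed hyperparameter."""
    nll_helpful = F.cross_entropy(helpful_logits.transpose(1, 2), helpful_ids)
    nll_harmful = F.cross_entropy(harmful_logits.transpose(1, 2), harmful_ids)
    return nll_helpful - penalty * nll_harmful
```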
Our model STARLING, a fine-tuned Vicuna-7B, is observed to be more safely\naligned when evaluated on RED-EVAL and HHH benchmarks while preserving the\nutility of the baseline models (TruthfulQA, MMLU, and BBH).\n","authors":["Rishabh Bhardwaj","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2308.09662v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15930v1","updated":"2023-08-30T10:12:39Z","published":"2023-08-30T10:12:39Z","title":"LLaSM: Large Language and Speech Model","summary":" Multi-modal large language models have garnered significant interest\nrecently. Most existing works, though, focus on vision-language multi-modal\nmodels that provide strong capabilities in following vision-and-language\ninstructions. However, we claim that speech is also an important modality\nthrough which humans interact with the world. Hence, it is crucial for a\ngeneral-purpose assistant to be able to follow multi-modal speech-and-language\ninstructions. In this work, we propose Large Language and Speech Model (LLaSM).\nLLaSM is an end-to-end trained large multi-modal speech-language model with\ncross-modal conversational abilities, capable of following speech-and-language\ninstructions. Our early experiments show that LLaSM demonstrates a more\nconvenient and natural way for humans to interact with artificial intelligence.\nSpecifically, we also release a large Speech Instruction Following dataset,\nLLaSM-Audio-Instructions. Code and demo are available at\nhttps://github.com/LinkSoul-AI/LLaSM and\nhttps://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions\ndataset is available at\nhttps://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.\n","authors":["Yu Shu","Siwei Dong","Guangyao Chen","Wenhao Huang","Ruihua Zhang","Daochen Shi","Qiqi Xiang","Yemin Shi"],"pdf_url":"https://arxiv.org/pdf/2308.15930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15906v1","updated":"2023-08-30T09:19:06Z","published":"2023-08-30T09:19:06Z","title":"Is the U.S. Legal System Ready for AI's Challenges to Human Values?","summary":" Our interdisciplinary study investigates how effectively U.S. laws confront\nthe challenges posed by Generative AI to human values. Through an analysis of\ndiverse hypothetical scenarios crafted during an expert workshop, we have\nidentified notable gaps and uncertainties within the existing legal framework\nregarding the protection of fundamental values, such as autonomy, privacy,\ndignity, diversity, equality, and physical/mental well-being. Constitutional\nand civil rights, it appears, may not provide sufficient protection against\nAI-generated discriminatory outputs. Furthermore, even if we exclude the\nliability shield provided by Section 230, proving causation for defamation and\nproduct liability claims is a challenging endeavor due to the intricate and\nopaque nature of AI systems. To address the unique and unforeseeable threats\nposed by Generative AI, we advocate for legal frameworks that evolve to\nrecognize new threats and provide proactive, auditable guidelines to industry\nstakeholders. 
Addressing these issues requires deep interdisciplinary\ncollaborations to identify harms, values, and mitigation strategies.\n","authors":["Inyoung Cheong","Aylin Caliskan","Tadayoshi Kohno"],"pdf_url":"https://arxiv.org/pdf/2308.15906v1.pdf","comment":"26 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.15885v1","updated":"2023-08-30T09:04:06Z","published":"2023-08-30T09:04:06Z","title":"Towards One-Shot Learning for Text Classification using Inductive Logic\n Programming","summary":" With the ever-increasing potential of AI to perform personalised tasks, it is\nbecoming essential to develop new machine learning techniques which are\ndata-efficient and do not require hundreds or thousands of training examples.\nIn this paper, we explore an Inductive Logic Programming approach for one-shot\ntext classification. In particular, we explore the framework of\nMeta-Interpretive Learning (MIL), along with using common-sense background\nknowledge extracted from ConceptNet. Results indicate that MIL can learn text\nclassification rules from a small number of training examples. Moreover, the\nhigher the complexity of the chosen examples, the higher the accuracy of the\noutcome.\n","authors":["Ghazal Afroozi Milani","Daniel Cyrus","Alireza Tamaddoni-Nezhad"],"pdf_url":"https://arxiv.org/pdf/2308.15885v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15122v2","updated":"2023-08-30T09:03:23Z","published":"2023-08-29T08:41:16Z","title":"SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge\n Distillation from BERT","summary":" Spiking neural networks (SNNs) offer a promising avenue to implement deep\nneural networks in a more energy-efficient way. However, the network\narchitectures of existing SNNs for language tasks are too simplistic, and deep\narchitectures have not been fully explored, resulting in a significant\nperformance gap compared to mainstream transformer-based networks such as BERT.\nTo this end, we improve a recently-proposed spiking transformer (i.e.,\nSpikformer) to make it possible to process language tasks and propose a\ntwo-stage knowledge distillation method for training it, which combines\npre-training by distilling knowledge from BERT with a large collection of\nunlabelled texts and fine-tuning with task-specific instances via knowledge\ndistillation again from the BERT fine-tuned on the same training examples.\nThrough extensive experimentation, we show that the models trained with our\nmethod, named SpikeBERT, outperform state-of-the-art SNNs and even achieve\ncomparable results to BERTs on text classification tasks for both English and\nChinese with much less energy consumption.\n","authors":["Changze Lv","Tianlong Li","Jianhan Xu","Chenxi Gu","Zixuan Ling","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.03780v3","updated":"2023-08-30T08:56:37Z","published":"2023-02-07T22:37:21Z","title":"Reliable Natural Language Understanding with Large Language Models and\n Answer Set Programming","summary":" Humans understand language by extracting information (meaning) from\nsentences, combining it with existing commonsense knowledge, and then\nperforming reasoning to draw conclusions. While large language models (LLMs)\nsuch as GPT-3 and ChatGPT are able to leverage patterns in the text to solve a\nvariety of NLP tasks, they fall short in problems that require reasoning. 
They\nalso cannot reliably explain the answers generated for a given question. In\norder to emulate humans better, we propose STAR, a framework that combines LLMs\nwith Answer Set Programming (ASP). We show how LLMs can be used to effectively\nextract knowledge -- represented as predicates -- from language. Goal-directed\nASP is then employed to reliably reason over this knowledge. We apply the STAR\nframework to three different NLU tasks requiring reasoning: qualitative\nreasoning, mathematical reasoning, and goal-directed conversation. Our\nexperiments reveal that STAR is able to bridge the gap of reasoning in NLU\ntasks, leading to significant performance improvements, especially for smaller\nLLMs, i.e., LLMs with a smaller number of parameters. NLU applications\ndeveloped using the STAR framework are also explainable: along with the\npredicates generated, a justification in the form of a proof tree can be\nproduced for a given output.\n","authors":["Abhiramon Rajasekharan","Yankai Zeng","Parth Padalkar","Gopal Gupta"],"pdf_url":"https://arxiv.org/pdf/2302.03780v3.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2211.05994v4","updated":"2023-08-30T08:02:56Z","published":"2022-11-11T04:29:02Z","title":"A Survey of Knowledge Enhanced Pre-trained Language Models","summary":" Pre-trained Language Models (PLMs), which are trained on large text corpora\nvia self-supervised learning, have yielded promising performance on various\ntasks in Natural Language Processing (NLP). However, though PLMs with huge\nparameters can effectively possess rich knowledge learned from massive training\ntext and benefit downstream tasks at the fine-tuning stage, they still have\nsome limitations such as poor reasoning ability due to the lack of external\nknowledge. Research has been dedicated to incorporating knowledge into PLMs to\ntackle these issues. In this paper, we present a comprehensive review of\nKnowledge Enhanced Pre-trained Language Models (KE-PLMs) to provide a clear\ninsight into this thriving field. We introduce appropriate taxonomies\nrespectively for Natural Language Understanding (NLU) and Natural Language\nGeneration (NLG) to highlight these two main tasks of NLP. For NLU, we divide\nthe types of knowledge into four categories: linguistic knowledge, text\nknowledge, knowledge graph (KG), and rule knowledge. The KE-PLMs for NLG are\ncategorized into KG-based and retrieval-based methods. Finally, we point out\nsome promising future directions of KE-PLMs.\n","authors":["Linmei Hu","Zeyi Liu","Ziwang Zhao","Lei Hou","Liqiang Nie","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2211.05994v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15813v1","updated":"2023-08-30T07:36:12Z","published":"2023-08-30T07:36:12Z","title":"Knowledge-grounded Natural Language Recommendation Explanation","summary":" Explanations accompanied by a recommendation can assist users in\nunderstanding the decision made by recommendation systems, which in turn\nincreases a user's confidence and trust in the system. Recently, research has\nfocused on generating natural language explanations in a human-readable format.\nThus far, the proposed approaches leverage item reviews written by users, which\nare often subjective, sparse in language, and unable to account for new items\nthat have not been purchased or reviewed before. 
Instead, we aim to generate\nfact-grounded recommendation explanations that are objectively described with\nitem features while implicitly considering a user's preferences, based on the\nuser's purchase history. To achieve this, we propose a knowledge graph (KG)\napproach to natural language explainable recommendation. Our approach draws on\nuser-item features through a novel collaborative filtering-based KG\nrepresentation to produce fact-grounded, personalized explanations, while\njointly learning user-item representations for recommendation scoring.\nExperimental results show that our approach consistently outperforms previous\nstate-of-the-art models on natural language explainable recommendation.\n","authors":["Anthony Colas","Jun Araki","Zhengyu Zhou","Bingqing Wang","Zhe Feng"],"pdf_url":"https://arxiv.org/pdf/2308.15813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15812v1","updated":"2023-08-30T07:35:32Z","published":"2023-08-30T07:35:32Z","title":"Peering Through Preferences: Unraveling Feedback Acquisition for\n Aligning Large Language Models","summary":" Aligning large language models (LLMs) with human values and intents\ncritically involves the use of human or AI feedback. While dense feedback\nannotations are expensive to acquire and integrate, sparse feedback presents a\nstructural design choice between ratings (e.g., score Response A on a scale of\n1-7) and rankings (e.g., is Response A better than Response B?). In this work,\nwe analyze the effect of this design choice for the alignment and evaluation of\nLLMs. We uncover an inconsistency problem wherein the preferences inferred from\nratings and rankings significantly disagree 60% of the time for both human and\nAI annotators. Our subsequent analysis identifies various facets of annotator\nbiases that explain this phenomenon, such as that human annotators rate denser\nresponses higher while preferring accuracy during pairwise judgments. To our\nsurprise, we observe that the choice of feedback protocol also has a\nsignificant effect on the evaluation of aligned LLMs. In particular, we find\nthat LLMs that leverage rankings data for alignment (say model X) are preferred\nover those that leverage ratings data (say model Y), with a rank-based\nevaluation protocol (is X/Y's response better than reference response?) but not\nwith a rating-based evaluation protocol (score X/Y's response on a scale\nof 1-7). Our findings thus shed light on critical gaps in methods for\nevaluating the real-world utility of language models and their strong\ndependence on the feedback protocol used for alignment. Our code and data are\navailable at https://github.com/Hritikbansal/sparse_feedback.\n","authors":["Hritik Bansal","John Dang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2308.15812v1.pdf","comment":"24 pages, 12 Tables, 3 Figures"},{"id":"http://arxiv.org/abs/2308.15793v1","updated":"2023-08-30T06:53:24Z","published":"2023-08-30T06:53:24Z","title":"HAlf-MAsked Model for Named Entity Sentiment analysis","summary":" Named Entity Sentiment analysis (NESA) is one of the most actively developing\napplication domains in Natural Language Processing (NLP). Social media NESA is\na significant field of opinion analysis since detecting and tracking sentiment\ntrends in the news flow is crucial for building various analytical systems and\nmonitoring the media image of specific people or companies. 
In this paper, we\nstudy different transformer-based solutions for NESA in the RuSentNE-23\nevaluation. Despite the effectiveness of the BERT-like models, they can still\nstruggle with certain challenges, such as overfitting, which appeared to be the\nmain obstacle in achieving high accuracy on the RuSentNE-23 data. We present\nseveral approaches to overcome this problem, among which there is a novel\ntechnique of an additional pass over the given data with the entity masked\nbefore making the final prediction, so that we can combine logits from the\nmodel when it knows the exact entity it predicts sentiment for and when it does\nnot. Utilizing this technique, we ensemble multiple BERT-like models trained on\ndifferent subsets of data to improve overall performance. Our proposed model\nachieves the best result on RuSentNE-23 evaluation data and demonstrates\nimproved consistency in entity-level sentiment analysis.\n","authors":["Anton Kabaev","Pavel Podberezko","Andrey Kaznacheev","Sabina Abdullayeva"],"pdf_url":"https://arxiv.org/pdf/2308.15793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06566v3","updated":"2023-08-30T06:46:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items. Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15772v1","updated":"2023-08-30T05:41:29Z","published":"2023-08-30T05:41:29Z","title":"Task-Based MoE for Multitask Multilingual Machine Translation","summary":" Mixture-of-experts (MoE) architecture has been proven a powerful method for\ndiverse tasks in training deep models in many applications. However, current\nMoE implementations are task agnostic, treating all tokens from different tasks\nin the same manner. In this work, we instead design a novel method that\nincorporates task information into MoE models at different granular levels with\nshared dynamic task-based adapters. 
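The half-masked trick in the NESA entry above combines predictions from a pass where the model sees the entity and a pass where the entity is masked. A minimal sketch, assuming a Hugging-Face-style model that returns `.logits` and plain averaging as the combination rule (the paper may weight the two passes differently):

```python
import torch

def half_masked_logits(model, tokens, masked_tokens):
    """Average logits from two forward passes: one with the target entity
    visible (`tokens`) and one with it replaced by a mask token
    (`masked_tokens`). Both inputs are pre-tokenized id tensors."""
    with torch.no_grad():
        logits_full = model(tokens).logits
        logits_masked = model(masked_tokens).logits
    return (logits_full + logits_masked) / 2
```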
Our experiments and analysis show the\nadvantages of our approaches over the dense and canonical MoE models on\nmulti-task multilingual machine translations. With task-specific adapters, our\nmodels can additionally generalize to new tasks efficiently.\n","authors":["Hai Pham","Young Jin Kim","Subhabrata Mukherjee","David P. Woodruff","Barnabas Poczos","Hany Hassan Awadalla"],"pdf_url":"https://arxiv.org/pdf/2308.15772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15053v2","updated":"2023-08-30T04:46:19Z","published":"2023-08-29T06:27:58Z","title":"Adapting Text-based Dialogue State Tracker for Spoken Dialogues","summary":" Although there have been remarkable advances in dialogue systems through the\ndialogue systems technology competition (DSTC), it remains one of the key\nchallenges to building a robust task-oriented dialogue system with a speech\ninterface. Most of the progress has been made for text-based dialogue systems\nsince there are abundant datasets with written corpora while those with spoken\ndialogues are very scarce. However, as can be seen from voice assistant systems\nsuch as Siri and Alexa, it is of practical importance to transfer the success\nto spoken dialogues. In this paper, we describe our engineering effort in\nbuilding a highly successful model that participated in the speech-aware\ndialogue systems technology challenge track in DSTC11. Our model consists of\nthree major modules: (1) automatic speech recognition error correction to\nbridge the gap between the spoken and the text utterances, (2) text-based\ndialogue system (D3ST) for estimating the slots and values using slot\ndescriptions, and (3) post-processing for recovering the error of the estimated\nslot value. Our experiments show that it is important to use an explicit\nautomatic speech recognition error correction module, post-processing, and data\naugmentation to adapt a text-based dialogue state tracker for spoken dialogue\ncorpora.\n","authors":["Jaeseok Yoon","Seunghyun Hwang","Ran Han","Jeonguk Bang","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15053v2.pdf","comment":"8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at\n SIGDIAL 2023"},{"id":"http://arxiv.org/abs/1811.03325v5","updated":"2023-08-30T04:39:22Z","published":"2018-11-08T09:16:19Z","title":"Marshall-Olkin Power-Law Distributions in Length-Frequency of Entities","summary":" Entities involve important concepts with concrete meanings and play important\nroles in numerous linguistic tasks. Entities have different forms in different\nlinguistic tasks and researchers treat those different forms as different\nconcepts. In this paper, we are curious to know whether there are some common\ncharacteristics that connect those different forms of entities. Specifically,\nwe investigate the underlying distributions of entities from different types\nand different languages, trying to figure out some common characteristics\nbehind those diverse entities. After analyzing twelve datasets about different\ntypes of entities and eighteen datasets about entities in different languages,\nwe find that while these entities are dramatically diverse from each other in\nmany aspects, their length-frequencies can be well characterized by a family of\nMarshall-Olkin power-law (MOPL) distributions. 
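The Marshall-Olkin power-law family mentioned above is obtained by applying the Marshall-Olkin transform to a power-law base distribution. The sketch below assumes a Pareto base survival function Fbar(x) = (x/x_min)**(-beta) and the standard transform Gbar(x) = alpha*Fbar(x) / (1 - (1-alpha)*Fbar(x)); the paper's exact parameterization may differ:

```python
def mopl_survival(x, alpha, beta, x_min=1.0):
    """Survival function of a Marshall-Olkin transformed power law.

    Assumes a Pareto base distribution and the classic Marshall-Olkin
    extension; valid for x >= x_min, alpha > 0, beta > 0.
    """
    f_bar = (x / x_min) ** -beta                      # base survival
    return alpha * f_bar / (1.0 - (1.0 - alpha) * f_bar)

# Example: survival probability of entity length 5 under assumed parameters.
print(mopl_survival(5.0, alpha=0.8, beta=1.5))
```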
We conduct experiments on those\nthirty datasets covering entities of different types and in different\nlanguages, and experimental results demonstrate that MOPL models characterize\nthe length-frequencies of entities much better than two state-of-the-art\npower-law models and an alternative log-normal model. Experimental results also\ndemonstrate that MOPL models are scalable to the length-frequency of entities\nin large-scale real-world datasets.\n","authors":["Xiaoshi Zhong","Xiang Yu","Erik Cambria","Jagath C. Rajapakse"],"pdf_url":"https://arxiv.org/pdf/1811.03325v5.pdf","comment":"33 pages, 3 figures (30 subfigures), 8 tables. To appear in\n Knowledge-Based Systems"},{"id":"http://arxiv.org/abs/2308.15745v1","updated":"2023-08-30T03:52:28Z","published":"2023-08-30T03:52:28Z","title":"Cyberbullying Detection for Low-resource Languages and Dialects: Review\n of the State of the Art","summary":" The struggle of social media platforms to moderate content in a timely\nmanner encourages users to abuse such platforms to spread vulgar or abusive\nlanguage, which, when performed repeatedly, becomes cyberbullying, a social\nproblem taking place in virtual environments, yet with real-world consequences,\nsuch as depression, withdrawal, or even suicide attempts of its victims.\nSystems for the automatic detection and mitigation of cyberbullying have been\ndeveloped but, unfortunately, the vast majority of them are for the English\nlanguage, with only a handful available for low-resource languages. To estimate\nthe present state of research and recognize the needs for further development,\nin this paper we present a comprehensive systematic survey of studies done so\nfar for automatic cyberbullying detection in low-resource languages. We\nanalyzed all studies on this topic that were available. We investigated more\nthan seventy published studies on automatic detection of cyberbullying or\nrelated language in low-resource languages and dialects that were published\nbetween around 2017 and January 2023. There are 23 low-resource languages and\ndialects covered by this paper, including Bangla, Hindi, Dravidian languages\nand others. In the survey, we identify some of the research gaps of previous\nstudies, which include the lack of reliable definitions of cyberbullying and\nits relevant subcategories, and biases in the acquisition and annotation of\ndata. Based on recognizing those research gaps, we provide some suggestions for\nimproving the general research conduct in cyberbullying detection, with a\nprimary focus on low-resource languages. Based on those proposed suggestions,\nwe collect and release a cyberbullying dataset in the Chittagonian dialect of\nBangla and propose a number of initial ML solutions trained on that dataset. In\naddition, the pre-trained transformer-based BanglaBERT model was also\nevaluated.\n","authors":["Tanjim Mahmud","Michal Ptaszynski","Juuso Eronen","Fumito Masui"],"pdf_url":"https://arxiv.org/pdf/2308.15745v1.pdf","comment":"52 Pages"},{"id":"http://arxiv.org/abs/2308.03188v2","updated":"2023-08-30T03:47:34Z","published":"2023-08-06T18:38:52Z","title":"Automatically Correcting Large Language Models: Surveying the landscape\n of diverse self-correction strategies","summary":" Large language models (LLMs) have demonstrated remarkable performance across\na wide array of NLP tasks. However, their efficacy is undermined by undesired\nand inconsistent behaviors, including hallucination, unfaithful reasoning, and\ntoxic content. 
A promising approach to rectify these flaws is self-correction,\nwhere the LLM itself is prompted or guided to fix problems in its own output.\nTechniques leveraging automated feedback -- either produced by the LLM itself\nor some external system -- are of particular interest as they are a promising\nway to make LLM-based solutions more practical and deployable with minimal\nhuman feedback. This paper presents a comprehensive review of this emerging\nclass of techniques. We analyze and taxonomize a wide array of recent work\nutilizing these strategies, including training-time, generation-time, and\npost-hoc correction. We also summarize the major applications of this strategy\nand conclude by discussing future directions and challenges.\n","authors":["Liangming Pan","Michael Saxon","Wenda Xu","Deepak Nathani","Xinyi Wang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03188v2.pdf","comment":"Work in Progress. Version 2"},{"id":"http://arxiv.org/abs/2308.05361v2","updated":"2023-08-30T03:11:56Z","published":"2023-08-10T06:08:20Z","title":"WeaverBird: Empowering Financial Decision-Making with Large Language\n Model, Knowledge Base, and Search Engine","summary":" We present WeaverBird, an intelligent dialogue system designed specifically\nfor the finance domain. Our system harnesses a large language model of GPT\narchitecture that has been tuned using extensive corpora of finance-related\ntext. As a result, our system possesses the capability to understand complex\nfinancial queries, such as \"How should I manage my investments during\ninflation?\", and provide informed responses. Furthermore, our system\nincorporates a local knowledge base and a search engine to retrieve relevant\ninformation. The final responses are conditioned on the search results and\ninclude proper citations to the sources, thus enjoying an enhanced credibility.\nThrough a range of finance-related questions, we have demonstrated the superior\nperformance of our system compared to other models. To experience our system\nfirsthand, users can interact with our live demo at\nhttps://weaverbird.ttic.edu, as well as watch our 2-min video illustration at\nhttps://www.youtube.com/watch?v=fyV2qQkX6Tc.\n","authors":["Siqiao Xue","Fan Zhou","Yi Xu","Hongyu Zhao","Shuo Xie","Qingyang Dai","Caigao Jiang","James Zhang","Jun Zhou","Dacheng Xiu","Hongyuan Mei"],"pdf_url":"https://arxiv.org/pdf/2308.05361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15727v1","updated":"2023-08-30T03:06:47Z","published":"2023-08-30T03:06:47Z","title":"Quantifying and Analyzing Entity-level Memorization in Large Language\n Models","summary":" Large language models (LLMs) have been proven capable of memorizing their\ntraining data, which can be extracted through specifically designed prompts. As\nthe scale of datasets continues to grow, privacy risks arising from\nmemorization have attracted increasing attention. Quantifying language model\nmemorization helps evaluate potential privacy risks. However, prior works on\nquantifying memorization require access to the precise original data or incur\nsubstantial computational overhead, making it difficult for applications in\nreal-world language models. To this end, we propose a fine-grained,\nentity-level definition to quantify memorization with conditions and metrics\ncloser to real-world scenarios. 
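A toy version of the entity-reconstruction probe described in the memorization entry above: feed a causal LM the context surrounding an entity and check whether greedy decoding reproduces it. The model name, prompt, and target entity here are placeholders for illustration, not the paper's data or metrics:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder model and prompt; a real probe would use the model under
# audit and prefixes drawn from (suspected) training data.
tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prefix = "The patient record lists the name"
ids = tok(prefix, return_tensors="pt").input_ids
out = model.generate(ids, max_new_tokens=5, do_sample=False)
completion = tok.decode(out[0, ids.shape[1]:])
print("entity leaked?", "John" in completion)  # "John" is a stand-in entity
```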
In addition, we present an approach for\nefficiently extracting sensitive entities from autoregressive language models.\nWe conduct extensive experiments based on the proposed definition, probing\nlanguage models' ability to reconstruct sensitive entities under different\nsettings. We find that language models have strong memorization at the entity\nlevel and are able to reproduce the training data even with partial leakages.\nThe results demonstrate that LLMs not only memorize their training data but\nalso understand associations between entities. These findings necessitate that\ntrainers of LLMs exercise greater prudence regarding model memorization,\nadopting memorization mitigation techniques to preclude privacy violations.\n","authors":["Zhenhong Zhou","Jiuyang Xiang","Chaomeng Chen","Sen Su"],"pdf_url":"https://arxiv.org/pdf/2308.15727v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.15711v1","updated":"2023-08-30T02:22:40Z","published":"2023-08-30T02:22:40Z","title":"Optimizing Factual Accuracy in Text Generation through Dynamic Knowledge\n Selection","summary":" Language models (LMs) have revolutionized the way we interact with\ninformation, but they often generate nonfactual text, raising concerns about\ntheir reliability. Previous methods use external knowledge as references for\ntext generation to enhance factuality but often struggle with the knowledge\nmix-up (e.g., entity mismatch) of irrelevant references. Besides, as the length\nof the output text grows, the randomness of sampling can escalate,\ndetrimentally impacting the factual accuracy of the generated text. In this\npaper, we present DKGen, which divides text generation into an iterative\nprocess. In each iteration, DKGen takes the input query, the previously\ngenerated text and a subset of the reference passages as input to generate\nshort text. During the process, the subset is dynamically selected from the\nfull passage set based on their relevance to the previously generated text and\nthe query, largely eliminating the irrelevant references from input. To further\nenhance DKGen's ability to correctly use this external knowledge, DKGen\ndistills the relevance order of the reference passages into the decoder's\ncross-attention distribution. We train and evaluate DKGen on a large-scale\nbenchmark dataset. Experiment results show that DKGen outperforms all baseline\nmodels.\n","authors":["Hongjin Qian","Zhicheng Dou","Jiejun Tan","Haonan Chen","Haoqi Gu","Ruofei Lai","Xinyu Zhang","Zhao Cao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.15711v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2304.06767v3","updated":"2023-08-30T01:25:29Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. 
However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhances the model by fine-tuning on\nthese filtered samples. Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v3.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.16349v1","updated":"2023-08-30T22:50:32Z","published":"2023-08-30T22:50:32Z","title":"Affective Visual Dialog: A Large-Scale Benchmark for Emotional Reasoning\n Based on Visually Grounded Conversations","summary":" We introduce Affective Visual Dialog, an emotion explanation and reasoning\ntask as a testbed for research on understanding the formation of emotions in\nvisually grounded conversations. The task involves three skills: (1)\nDialog-based Question Answering (2) Dialog-based Emotion Prediction and (3)\nAffective emotion explanation generation based on the dialog. Our key\ncontribution is the collection of a large-scale dataset, dubbed AffectVisDial,\nconsisting of 50K 10-turn visually grounded dialogs as well as concluding\nemotion attributions and dialog-informed textual emotion explanations,\nresulting in a total of 27,180 working hours. We explain our design decisions\nin collecting the dataset and introduce the questioner and answerer tasks that\nare associated with the participants in the conversation. We train and\ndemonstrate solid Affective Visual Dialog baselines adapted from\nstate-of-the-art models. Remarkably, the responses generated by our models show\npromising emotional reasoning abilities in response to visually grounded\nconversations. Our project page is available at\nhttps://affective-visual-dialog.github.io.\n","authors":["Kilichbek Haydarov","Xiaoqian Shen","Avinash Madasu","Mahmoud Salem","Jia Li","Gamaleldin Elsayed","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2308.16349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16336v1","updated":"2023-08-30T21:56:36Z","published":"2023-08-30T21:56:36Z","title":"ToddlerBERTa: Exploiting BabyBERTa for Grammar Learning and Language\n Understanding","summary":" We present ToddlerBERTa, a BabyBERTa-like language model, exploring its\ncapabilities through five different models with varied hyperparameters.\nEvaluating on BLiMP, SuperGLUE, MSGS, and a Supplement benchmark from the\nBabyLM challenge, we find that smaller models can excel in specific tasks,\nwhile larger models perform well with substantial data. Despite training on a\nsmaller dataset, ToddlerBERTa demonstrates commendable performance, rivalling\nthe state-of-the-art RoBERTa-base. The model showcases robust language\nunderstanding, even with single-sentence pretraining, and competes with\nbaselines that leverage broader contextual information. 
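One RAFT-style round, as described in the abstract above, reduces to sample, rank, filter, fine-tune. The sketch below covers the selection step only; `generate` and `reward` are stand-ins for the policy model and the learned reward model, and `k`/`keep` are assumed hyperparameters:

```python
def raft_select(prompts, generate, reward, k=8, keep=1):
    """One RAFT-style filtering round: sample k candidates per prompt,
    score them with the reward model, and keep the best for the
    subsequent supervised fine-tuning step."""
    batch = []
    for p in prompts:
        candidates = [generate(p) for _ in range(k)]
        ranked = sorted(candidates, key=reward, reverse=True)
        batch.extend((p, c) for c in ranked[:keep])
    return batch  # (prompt, response) pairs for fine-tuning
```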
Our work provides\ninsights into hyperparameter choices, and data utilization, contributing to the\nadvancement of language models.\n","authors":["Omer Veysel Cagatan"],"pdf_url":"https://arxiv.org/pdf/2308.16336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01879v4","updated":"2023-08-30T21:28:01Z","published":"2023-05-03T03:47:00Z","title":"SCOTT: Self-Consistent Chain-of-Thought Distillation","summary":" Large language models (LMs) beyond a certain scale, demonstrate the emergent\ncapability of generating free-text rationales for their predictions via\nchain-of-thought (CoT) prompting. While CoT can yield dramatically improved\nperformance, such gains are only observed for sufficiently large LMs. Even more\nconcerning, there is little guarantee that the generated rationales are\nconsistent with LM's predictions or faithfully justify the decisions. In this\nwork, we propose a faithful knowledge distillation method to learn a small,\nself-consistent CoT model from a teacher model that is orders of magnitude\nlarger. To form better supervision, we elicit rationales supporting the gold\nanswers from a large LM (teacher) by contrastive decoding, which encourages the\nteacher to generate tokens that become more plausible only when the answer is\nconsidered. To ensure faithful distillation, we use the teacher-generated\nrationales to learn a student LM with a counterfactual reasoning objective,\nwhich prevents the student from ignoring the rationales to make inconsistent\npredictions. Experiments show that, while yielding comparable end-task\nperformance, our method can generate CoT rationales that are more faithful than\nbaselines do. Further analysis suggests that such a model respects the\nrationales more when making decisions; thus, we can improve its performance\nmore by refining its rationales.\n","authors":["Peifeng Wang","Zhengyang Wang","Zheng Li","Yifan Gao","Bing Yin","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2305.01879v4.pdf","comment":"11 pages, 8 figures. Accepted to ACL 2023"},{"id":"http://arxiv.org/abs/2307.00925v5","updated":"2023-08-30T19:32:00Z","published":"2023-07-03T10:53:05Z","title":"Automatic Design of Semantic Similarity Ensembles Using Grammatical\n Evolution","summary":" Semantic similarity measures are widely used in natural language processing\nto catalyze various computer-related tasks. However, no single semantic\nsimilarity measure is the most appropriate for all tasks, and researchers often\nuse ensemble strategies to ensure performance. This research work proposes a\nmethod for automatically designing semantic similarity ensembles. In fact, our\nproposed method uses grammatical evolution, for the first time, to\nautomatically select and aggregate measures from a pool of candidates to create\nan ensemble that maximizes correlation to human judgment. The method is\nevaluated on several benchmark datasets and compared to state-of-the-art\nensembles, showing that it can significantly improve similarity assessment\naccuracy and outperform existing methods in some cases. As a result, our\nresearch demonstrates the potential of using grammatical evolution to\nautomatically compare text and prove the benefits of using ensembles for\nsemantic similarity tasks. 
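As a rough illustration of the ensemble-design idea above: the paper evolves full ensemble expressions with grammatical evolution, whereas this sketch substitutes a much simpler mutation-based weight search over candidate measures, maximizing correlation with human judgments.

```python
# Simplified stand-in for grammatical evolution: hill-climb a weight vector
# that aggregates candidate similarity measures.
import numpy as np

def evolve_ensemble(measure_scores, human_scores, generations=200, seed=0):
    """measure_scores: (n_measures, n_pairs) per-measure similarities.
    human_scores: (n_pairs,) human judgments. Returns (weights, fitness)."""
    rng = np.random.default_rng(seed)
    n_measures = measure_scores.shape[0]
    weights = np.ones(n_measures) / n_measures

    def fitness(w):
        ensemble = w @ measure_scores          # weighted aggregation
        return np.corrcoef(ensemble, human_scores)[0, 1]

    best = fitness(weights)
    for _ in range(generations):
        mutant = np.clip(weights + rng.normal(0, 0.1, n_measures), 0, None)
        mutant /= mutant.sum() + 1e-12         # keep weights normalized
        f = fitness(mutant)
        if f > best:
            weights, best = mutant, f
    return weights, best
```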
The source code that illustrates our approach can be\ndownloaded from https://github.com/jorge-martinez-gil/sesige.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.00925v5.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2202.04053v3","updated":"2023-08-30T18:41:01Z","published":"2022-02-08T18:36:52Z","title":"DALL-Eval: Probing the Reasoning Skills and Social Biases of\n Text-to-Image Generation Models","summary":" Recently, DALL-E, a multimodal transformer language model, and its variants,\nincluding diffusion models, have shown high-quality text-to-image generation\ncapabilities. However, despite the realistic image generation results, there\nhas not been a detailed analysis of how to evaluate such models. In this work,\nwe investigate the visual reasoning capabilities and social biases of different\ntext-to-image models, covering both multimodal transformer language models and\ndiffusion models. First, we measure three visual reasoning skills: object\nrecognition, object counting, and spatial relation understanding. For this, we\npropose PaintSkills, a compositional diagnostic evaluation dataset that\nmeasures these skills. Despite the high-fidelity image generation capability, a\nlarge gap exists between the performance of recent models and the upper bound\naccuracy in object counting and spatial relation understanding skills. Second,\nwe assess the gender and skin tone biases by measuring the gender/skin tone\ndistribution of generated images across various professions and attributes. We\ndemonstrate that recent text-to-image generation models learn specific biases\nabout gender and skin tone from web image-text pairs. We hope our work will\nhelp guide future progress in improving text-to-image generation models on\nvisual reasoning skills and learning socially unbiased representations. Code\nand data: https://github.com/j-min/DallEval\n","authors":["Jaemin Cho","Abhay Zala","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2202.04053v3.pdf","comment":"ICCV 2023 (34 pages; see appendix for version changelog)"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.16187v1","updated":"2023-08-30T17:59:11Z","published":"2023-08-30T17:59:11Z","title":"Boosting Detection in Crowd Analysis via Underutilized Output Features","summary":" Detection-based methods have been viewed unfavorably in crowd analysis due to\ntheir poor performance in dense crowds. However, we argue that the potential of\nthese methods has been underestimated, as they offer crucial information for\ncrowd analysis that is often ignored. Specifically, the area size and\nconfidence score of output proposals and bounding boxes provide insight into\nthe scale and density of the crowd. To leverage these underutilized features,\nwe propose Crowd Hat, a plug-and-play module that can be easily integrated with\nexisting detection models. This module uses a mixed 2D-1D compression technique\nto refine the output features and obtain the spatial and numerical distribution\nof crowd-specific information. Based on these features, we further propose\nregion-adaptive NMS thresholds and a decouple-then-align paradigm that address\nthe major limitations of detection-based methods. 
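To make the region-adaptive NMS idea from the Crowd Hat abstract concrete, here is an illustrative sketch in which the local-density estimate and the density-to-threshold mapping are invented for exposition, not taken from the paper:

```python
# Rough sketch of region-adaptive NMS: denser regions get a looser (higher)
# IoU threshold so overlapping true positives are not suppressed.
import numpy as np

def iou(a, b):
    """a, b: [x1, y1, x2, y2]."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area = lambda r: max(0.0, r[2] - r[0]) * max(0.0, r[3] - r[1])
    return inter / (area(a) + area(b) - inter + 1e-9)

def region_adaptive_nms(boxes, scores, base_thr=0.5, max_thr=0.8):
    order = np.argsort(scores)[::-1]
    centers = (boxes[:, :2] + boxes[:, 2:]) / 2
    suppressed = np.zeros(len(boxes), dtype=bool)
    keep = []
    for pos, i in enumerate(order):
        if suppressed[i]:
            continue
        keep.append(int(i))
        # Illustrative density estimate: detections near this box's center.
        radius = max(boxes[i, 2] - boxes[i, 0], boxes[i, 3] - boxes[i, 1])
        density = int(np.sum(np.linalg.norm(centers - centers[i], axis=1) < 2 * radius))
        thr = min(max_thr, base_thr + 0.02 * density)  # denser -> higher threshold
        for j in order[pos + 1:]:
            if not suppressed[j] and iou(boxes[i], boxes[j]) > thr:
                suppressed[j] = True
    return keep
```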
Our extensive evaluations on\nvarious crowd analysis tasks, including crowd counting, localization, and\ndetection, demonstrate the effectiveness of utilizing output features and the\npotential of detection-based methods in crowd analysis.\n","authors":["Shaokai Wu","Fengyu Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16187v1.pdf","comment":"project page: https://fredfyyang.github.io/Crowd-Hat/"},{"id":"http://arxiv.org/abs/2308.16184v1","updated":"2023-08-30T17:59:02Z","published":"2023-08-30T17:59:02Z","title":"SAM-Med2D","summary":" The Segment Anything Model (SAM) represents a state-of-the-art research\nadvancement in natural image segmentation, achieving impressive results with\ninput prompts such as points and bounding boxes. However, our evaluation and\nrecent research indicate that directly applying the pretrained SAM to medical\nimage segmentation does not yield satisfactory performance. This limitation\nprimarily arises from significant domain gap between natural images and medical\nimages. To bridge this gap, we introduce SAM-Med2D, the most comprehensive\nstudies on applying SAM to medical 2D images. Specifically, we first collect\nand curate approximately 4.6M images and 19.7M masks from public and private\ndatasets, constructing a large-scale medical image segmentation dataset\nencompassing various modalities and objects. Then, we comprehensively fine-tune\nSAM on this dataset and turn it into SAM-Med2D. Unlike previous methods that\nonly adopt bounding box or point prompts as interactive segmentation approach,\nwe adapt SAM to medical image segmentation through more comprehensive prompts\ninvolving bounding boxes, points, and masks. We additionally fine-tune the\nencoder and decoder of the original SAM to obtain a well-performed SAM-Med2D,\nleading to the most comprehensive fine-tuning strategies to date. Finally, we\nconducted a comprehensive evaluation and analysis to investigate the\nperformance of SAM-Med2D in medical image segmentation across various\nmodalities, anatomical structures, and organs. Concurrently, we validated the\ngeneralization capability of SAM-Med2D on 9 datasets from MICCAI 2023\nchallenge. Overall, our approach demonstrated significantly superior\nperformance and generalization capability compared to SAM.\n","authors":["Junlong Cheng","Jin Ye","Zhongying Deng","Jianpin Chen","Tianbin Li","Haoyu Wang","Yanzhou Su","Ziyan Huang","Jilong Chen","Lei Jiang","Hui Sun","Junjun He","Shaoting Zhang","Min Zhu","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2308.16184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16182v1","updated":"2023-08-30T17:58:50Z","published":"2023-08-30T17:58:50Z","title":"GREC: Generalized Referring Expression Comprehension","summary":" The objective of Classic Referring Expression Comprehension (REC) is to\nproduce a bounding box corresponding to the object mentioned in a given textual\ndescription. Commonly, existing datasets and techniques in classic REC are\ntailored for expressions that pertain to a single target, meaning a sole\nexpression is linked to one specific object. Expressions that refer to multiple\ntargets or involve no specific target have not been taken into account. This\nconstraint hinders the practical applicability of REC. This study introduces a\nnew benchmark termed as Generalized Referring Expression Comprehension (GREC).\nThis benchmark extends the classic REC by permitting expressions to describe\nany number of target objects. 
To achieve this goal, we have built the first\nlarge-scale GREC dataset named gRefCOCO. This dataset encompasses a range of\nexpressions: those referring to multiple targets, expressions with no specific\ntarget, and single-target expressions. The design of GREC and gRefCOCO\nensures smooth compatibility with classic REC. The proposed gRefCOCO dataset, a\nGREC method implementation code, and GREC evaluation code are available at\nhttps://github.com/henghuiding/gRefCOCO.\n","authors":["Shuting He","Henghui Ding","Chang Liu","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.16182v1.pdf","comment":"GREC Technical Report, Project Page:\n https://henghuiding.github.io/GRES"},{"id":"http://arxiv.org/abs/2303.17590v2","updated":"2023-08-30T17:46:17Z","published":"2023-03-30T17:57:43Z","title":"Going Beyond Nouns With Vision & Language Models Using Synthetic Data","summary":" Large-scale pre-trained Vision & Language (VL) models have shown remarkable\nperformance in many applications, enabling replacing a fixed set of supported\nclasses with zero-shot open vocabulary reasoning over (almost arbitrary)\nnatural language prompts. However, recent works have uncovered a fundamental\nweakness of these models. For example, they have difficulty understanding Visual\nLanguage Concepts (VLC) that go 'beyond nouns', such as the meaning of\nnon-object words (e.g., attributes, actions, relations, states, etc.), and\ndifficulty performing compositional reasoning, such as understanding the\nsignificance of word order in a sentence. In this work, we\ninvestigate to what extent purely synthetic data could be leveraged to teach\nthese models to overcome such shortcomings without compromising their zero-shot\ncapabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale\nsynthetic dataset and data generation codebase that allows generating additional\nsuitable data to improve VLC understanding and compositional reasoning of VL\nmodels. Additionally, we propose a general VL finetuning strategy for\neffectively leveraging SyViC towards achieving these improvements. Our\nextensive experiments and ablations on VL-Checklist, Winoground, and ARO\nbenchmarks demonstrate that it is possible to adapt strong pre-trained VL\nmodels with synthetic data, significantly enhancing their VLC understanding\n(e.g., by 9.9% on ARO and 4.3% on VL-Checklist) with under a 1% drop in their\nzero-shot accuracy.\n","authors":["Paola Cascante-Bonilla","Khaled Shehada","James Seale Smith","Sivan Doveh","Donghyun Kim","Rameswar Panda","Gül Varol","Aude Oliva","Vicente Ordonez","Rogerio Feris","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2303.17590v2.pdf","comment":"Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/"},{"id":"http://arxiv.org/abs/2308.16154v1","updated":"2023-08-30T17:20:46Z","published":"2023-08-30T17:20:46Z","title":"MMVP: Motion-Matrix-based Video Prediction","summary":" A central challenge of video prediction lies in reasoning about objects'\nfuture motions from image frames while simultaneously maintaining\nthe consistency of their appearance across frames. This work introduces an\nend-to-end trainable two-stream video prediction framework, Motion-Matrix-based\nVideo Prediction (MMVP), to tackle this challenge. Unlike previous methods that\nusually handle motion prediction and appearance maintenance within the same set\nof modules, MMVP decouples motion and appearance information by constructing\nappearance-agnostic motion matrices. 
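A motion matrix of the kind just introduced reduces to pairwise temporal similarity between feature patches of consecutive frames; a minimal sketch, where the patch features and cosine normalization are assumptions:

```python
# Sketch of an MMVP-style motion matrix: temporal cosine similarity between
# every pair of feature patches in two consecutive frames.
import torch
import torch.nn.functional as F

def motion_matrix(feat_t, feat_t1):
    """feat_t, feat_t1: (C, H, W) feature maps of frames t and t+1.
    Returns an (H*W, H*W) matrix of patch-to-patch similarities."""
    c, h, w = feat_t.shape
    a = F.normalize(feat_t.reshape(c, h * w), dim=0)   # (C, N)
    b = F.normalize(feat_t1.reshape(c, h * w), dim=0)  # (C, N)
    return a.t() @ b  # entry (i, j): similarity of patch i at t to patch j at t+1

frames = torch.randn(2, 64, 16, 16)
M = motion_matrix(frames[0], frames[1])
print(M.shape)  # torch.Size([256, 256])
```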
The motion matrices represent the temporal\nsimilarity of each and every pair of feature patches in the input frames, and\nare the sole input of the motion prediction module in MMVP. This design\nimproves video prediction in both accuracy and efficiency, and reduces the\nmodel size. Results of extensive experiments demonstrate that MMVP outperforms\nstate-of-the-art systems on public data sets by non-negligible large margins\n(about 1 db in PSNR, UCF Sports) in significantly smaller model sizes (84% the\nsize or smaller). Please refer to\nhttps://github.com/Kay1794/MMVP-motion-matrix-based-video-prediction for the\nofficial code and the datasets used in this paper.\n","authors":["Yiqi Zhong","Luming Liang","Ilya Zharkov","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16154v1.pdf","comment":"ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2308.16150v1","updated":"2023-08-30T17:16:02Z","published":"2023-08-30T17:16:02Z","title":"Modality Cycles with Masked Conditional Diffusion for Unsupervised\n Anomaly Segmentation in MRI","summary":" Unsupervised anomaly segmentation aims to detect patterns that are distinct\nfrom any patterns processed during training, commonly called abnormal or\nout-of-distribution patterns, without providing any associated manual\nsegmentations. Since anomalies during deployment can lead to model failure,\ndetecting the anomaly can enhance the reliability of models, which is valuable\nin high-risk domains like medical imaging. This paper introduces Masked\nModality Cycles with Conditional Diffusion (MMCCD), a method that enables\nsegmentation of anomalies across diverse patterns in multimodal MRI. The method\nis based on two fundamental ideas. First, we propose the use of cyclic modality\ntranslation as a mechanism for enabling abnormality detection.\nImage-translation models learn tissue-specific modality mappings, which are\ncharacteristic of tissue physiology. Thus, these learned mappings fail to\ntranslate tissues or image patterns that have never been encountered during\ntraining, and the error enables their segmentation. Furthermore, we combine\nimage translation with a masked conditional diffusion model, which attempts to\n`imagine' what tissue exists under a masked area, further exposing unknown\npatterns as the generative model fails to recreate them. We evaluate our method\non a proxy task by training on healthy-looking slices of BraTS2021\nmulti-modality MRIs and testing on slices with tumors. We show that our method\ncompares favorably to previous unsupervised approaches based on image\nreconstruction and denoising with autoencoders and diffusion models.\n","authors":["Ziyun Liang","Harry Anthony","Felix Wagner","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2308.16150v1.pdf","comment":"Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI\n 2023"},{"id":"http://arxiv.org/abs/2308.01981v2","updated":"2023-08-30T17:02:55Z","published":"2023-08-03T18:28:50Z","title":"CartiMorph: a framework for automated knee articular cartilage\n morphometrics","summary":" We introduce CartiMorph, a framework for automated knee articular cartilage\nmorphometrics. It takes an image as input and generates quantitative metrics\nfor cartilage subregions, including the percentage of full-thickness cartilage\nloss (FCL), mean thickness, surface area, and volume. 
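The cyclic-translation idea in the MMCCD abstract above can be sketched in a few lines of scoring logic, with `t1_to_t2`, `t2_to_t1`, and `mask_fn` as stand-ins for the learned translation models and masking strategy:

```python
# Schematic anomaly scoring via cyclic modality translation with masking.
import numpy as np

def anomaly_map(t1_image, t1_to_t2, t2_to_t1, mask_fn, threshold=0.2):
    masked = mask_fn(t1_image)           # hide patches the model must 'imagine'
    fake_t2 = t1_to_t2(masked)           # translate to the other modality
    recon_t1 = t2_to_t1(fake_t2)         # cycle back to the input modality
    error = np.abs(t1_image - recon_t1)  # unseen tissues reconstruct poorly
    return error, error > threshold      # continuous map and binary segmentation
```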
CartiMorph leverages the\npower of deep learning models for hierarchical image feature representation.\nDeep learning models were trained and validated for tissue segmentation,\ntemplate construction, and template-to-image registration. We established\nmethods for surface-normal-based cartilage thickness mapping, FCL estimation,\nand rule-based cartilage parcellation. Our cartilage thickness map showed less\nerror in thin and peripheral regions. We evaluated the effectiveness of the\nadopted segmentation model by comparing the quantitative metrics obtained from\nmodel segmentation and those from manual segmentation. The root-mean-squared\ndeviation of the FCL measurements was less than 8%, and strong correlations\nwere observed for the mean thickness (Pearson's correlation coefficient $\\rho\n\\in [0.82,0.97]$), surface area ($\\rho \\in [0.82,0.98]$) and volume ($\\rho \\in\n[0.89,0.98]$) measurements. We compared our FCL measurements with those from a\nprevious study and found that our measurements deviated less from the ground\ntruths. We observed superior performance of the proposed rule-based cartilage\nparcellation method compared with the atlas-based approach. CartiMorph has the\npotential to promote imaging biomarkers discovery for knee osteoarthritis.\n","authors":["Yongcheng Yao","Junru Zhong","Liping Zhang","Sheheryar Khan","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01981v2.pdf","comment":"To be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.16145v1","updated":"2023-08-30T17:01:01Z","published":"2023-08-30T17:01:01Z","title":"CircleFormer: Circular Nuclei Detection in Whole Slide Images with\n Circle Queries and Attention","summary":" Both CNN-based and Transformer-based object detection with bounding box\nrepresentation have been extensively studied in computer vision and medical\nimage analysis, but circular object detection in medical images is still\nunderexplored. Inspired by the recent anchor free CNN-based circular object\ndetection method (CircleNet) for ball-shape glomeruli detection in renal\npathology, in this paper, we present CircleFormer, a Transformer-based circular\nmedical object detection with dynamic anchor circles. Specifically, queries\nwith circle representation in Transformer decoder iteratively refine the\ncircular object detection results, and a circle cross attention module is\nintroduced to compute the similarity between circular queries and image\nfeatures. A generalized circle IoU (gCIoU) is proposed to serve as a new\nregression loss of circular object detection as well. Moreover, our approach is\neasy to generalize to the segmentation task by adding a simple segmentation\nbranch to CircleFormer. We evaluate our method in circular nuclei detection and\nsegmentation on the public MoNuSeg dataset, and the experimental results show\nthat our method achieves promising performance compared with the\nstate-of-the-art approaches. The effectiveness of each component is validated\nvia ablation studies as well. 
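For the circular-detection setting above, plain circle IoU has a closed form via the lens area of two intersecting circles; the sketch below computes it (the paper's generalized gCIoU adds further terms not reproduced here):

```python
# Circle IoU between detections parameterized as (cx, cy, r).
import math

def circle_iou(c1, c2):
    (x1, y1, r1), (x2, y2, r2) = c1, c2
    d = math.hypot(x2 - x1, y2 - y1)
    if d >= r1 + r2:                      # disjoint circles
        inter = 0.0
    elif d <= abs(r1 - r2):               # one circle inside the other
        inter = math.pi * min(r1, r2) ** 2
    else:                                 # partial overlap: circular lens area
        a1 = r1 * r1 * math.acos((d * d + r1 * r1 - r2 * r2) / (2 * d * r1))
        a2 = r2 * r2 * math.acos((d * d + r2 * r2 - r1 * r1) / (2 * d * r2))
        tri = 0.5 * math.sqrt(max(0.0, (-d + r1 + r2) * (d + r1 - r2)
                                       * (d - r1 + r2) * (d + r1 + r2)))
        inter = a1 + a2 - tri
    union = math.pi * (r1 * r1 + r2 * r2) - inter
    return inter / union

print(circle_iou((0, 0, 1), (1, 0, 1)))  # ~0.24 for unit circles offset by 1
```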
Our code is released at:\n\\url{https://github.com/zhanghx-iim-ahu/CircleFormer}.\n","authors":["Hengxu Zhang","Pengpeng Liang","Zhiyong Sun","Bo Song","Erkang Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.16145v1.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.16139v1","updated":"2023-08-30T16:52:20Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Eduardo A. 
Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2303.17783v3","updated":"2023-08-30T16:25:02Z","published":"2023-03-31T03:14:44Z","title":"Uncertainty-Aware Source-Free Adaptive Image Super-Resolution with\n Wavelet Augmentation Transformer","summary":" Unsupervised Domain Adaptation (UDA) can effectively address domain gap\nissues in real-world image Super-Resolution (SR) by accessing both the source\nand target data. Considering privacy policies or transmission restrictions of\nsource data in practical scenarios, we propose a SOurce-free Domain Adaptation\nframework for image SR (SODA-SR) to address this issue, i.e., adapt a\nsource-trained model to a target domain with only unlabeled target data.\nSODA-SR leverages the source-trained model to generate refined pseudo-labels\nfor teacher-student learning. To better utilize pseudo-labels, we propose a\nnovel wavelet-based augmentation method, named Wavelet Augmentation Transformer\n(WAT), which can be flexibly incorporated with existing networks, to implicitly\nproduce useful augmented data. WAT learns low-frequency information of varying\nlevels across diverse samples, which is aggregated efficiently via deformable\nattention. Furthermore, an uncertainty-aware self-training mechanism is\nproposed to improve the accuracy of pseudo-labels, with inaccurate predictions\nbeing rectified by uncertainty estimation. To acquire better SR results and\navoid overfitting pseudo-labels, several regularization losses are proposed to\nconstrain target LR and SR images in the frequency domain. Experiments show\nthat without accessing source data, SODA-SR outperforms state-of-the-art UDA\nmethods in both synthetic$\\rightarrow$real and real$\\rightarrow$real adaptation\nsettings, and is not constrained by specific network architectures.\n","authors":["Yuang Ai","Xiaoqiang Zhou","Huaibo Huang","Lei Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2303.17783v3.pdf","comment":"9 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2306.00914v2","updated":"2023-08-30T16:24:15Z","published":"2023-06-01T17:16:37Z","title":"Conditioning Diffusion Models via Attributes and Semantic Masks for Face\n Generation","summary":" Deep generative models have shown impressive results in generating realistic\nimages of faces. 
GANs managed to generate high-quality, high-fidelity images\nwhen conditioned on semantic masks, but they still lack the ability to\ndiversify their output. Diffusion models partially solve this problem and are\nable to generate diverse samples given the same condition. In this paper, we\npropose a multi-conditioning approach for diffusion models via cross-attention\nexploiting both attributes and semantic masks to generate high-quality and\ncontrollable face images. We also studied the impact of applying\nperceptual-focused loss weighting into the latent space instead of the pixel\nspace. Our method extends the previous approaches by introducing conditioning\non more than one set of features, guaranteeing a more fine-grained control over\nthe generated face images. We evaluate our approach on the CelebA-HQ dataset,\nand we show that it can generate realistic and diverse samples while allowing\nfor fine-grained control over multiple attributes and semantic regions.\nAdditionally, we perform an ablation study to evaluate the impact of different\nconditioning strategies on the quality and diversity of the generated images.\n","authors":["Nico Giambi","Giuseppe Lisanti"],"pdf_url":"https://arxiv.org/pdf/2306.00914v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16126v1","updated":"2023-08-30T16:23:07Z","published":"2023-08-30T16:23:07Z","title":"CorrEmbed: Evaluating Pre-trained Model Image Similarity Efficacy with a\n Novel Metric","summary":" Detecting visually similar images is a particularly useful attribute to look\nto when calculating product recommendations. Embedding similarity, which\nutilizes pre-trained computer vision models to extract high-level image\nfeatures, has demonstrated remarkable efficacy in identifying images with\nsimilar compositions. However, there is a lack of methods for evaluating the\nembeddings generated by these models, as conventional loss and performance\nmetrics do not adequately capture their performance in image similarity search\ntasks.\n In this paper, we evaluate the viability of the image embeddings from\nnumerous pre-trained computer vision models using a novel approach named\nCorrEmbed. Our approach computes the correlation between distances in image\nembeddings and distances in human-generated tag vectors. We extensively\nevaluate numerous pre-trained Torchvision models using this metric, revealing\nan intuitive relationship of linear scaling between ImageNet1k accuracy scores\nand tag-correlation scores. 
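The CorrEmbed metric just described can be sketched compactly: correlate pairwise distances in embedding space with pairwise distances between human tag vectors. The cosine distance and Pearson correlation below are illustrative choices, not necessarily the paper's exact configuration:

```python
# Compact sketch of the CorrEmbed idea.
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import pearsonr

def tag_correlation(embeddings, tag_vectors):
    """embeddings: (n, d) image embeddings; tag_vectors: (n, t) tag counts.
    Returns the correlation between the two pairwise-distance vectors."""
    emb_dists = pdist(embeddings, metric="cosine")
    tag_dists = pdist(tag_vectors.astype(float), metric="cosine")
    r, _ = pearsonr(emb_dists, tag_dists)
    return r

rng = np.random.default_rng(0)
print(tag_correlation(rng.normal(size=(50, 128)), rng.integers(0, 2, (50, 20))))
```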
Importantly, our method also identifies deviations\nfrom this pattern, providing insights into how different models capture\nhigh-level image features.\n By offering a robust performance evaluation of these pre-trained models,\nCorrEmbed serves as a valuable tool for researchers and practitioners seeking\nto develop effective, data-driven approaches to similar item recommendations in\nfashion retail.\n","authors":["Karl Audun Kagnes Borgersen","Morten Goodwin","Jivitesh Sharma","Tobias Aasmoe","Mari Leonhardsen","Gro Herredsvela Rørvik"],"pdf_url":"https://arxiv.org/pdf/2308.16126v1.pdf","comment":"Accepted to AI-2023 Forty-third SGAI International Conference on\n Artificial Intelligence"},{"id":"http://arxiv.org/abs/2308.16110v1","updated":"2023-08-30T16:10:21Z","published":"2023-08-30T16:10:21Z","title":"Improving Few-shot Image Generation by Structural Discrimination and\n Textural Modulation","summary":" Few-shot image generation, which aims to produce plausible and diverse images\nfor one category given a few images from this category, has drawn extensive\nattention. Existing approaches either globally interpolate different images or\nfuse local representations with pre-defined coefficients. However, such an\nintuitive combination of images/features only exploits the most relevant\ninformation for generation, leading to poor diversity and coarse-grained\nsemantic fusion. To remedy this, this paper proposes a novel textural\nmodulation (TexMod) mechanism to inject external semantic signals into internal\nlocal representations. Parameterized by the feedback from the discriminator,\nour TexMod enables more fined-grained semantic injection while maintaining the\nsynthesis fidelity. Moreover, a global structural discriminator (StructD) is\ndeveloped to explicitly guide the model to generate images with reasonable\nlayout and outline. Furthermore, the frequency awareness of the model is\nreinforced by encouraging the model to distinguish frequency signals. Together\nwith these techniques, we build a novel and effective model for few-shot image\ngeneration. The effectiveness of our model is identified by extensive\nexperiments on three popular datasets and various settings. Besides achieving\nstate-of-the-art synthesis performance on these datasets, our proposed\ntechniques could be seamlessly integrated into existing models for a further\nperformance boost.\n","authors":["Mengping Yang","Zhe Wang","Wenyi Feng","Qian Zhang","Ting Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.16110v1.pdf","comment":"To appear in ACM MM 2023, code is available at\n https://github.com/kobeshegu/SDTM-GAN-ACMMM-2023"},{"id":"http://arxiv.org/abs/2305.11582v2","updated":"2023-08-30T16:06:27Z","published":"2023-05-19T10:43:57Z","title":"What You Hear Is What You See: Audio Quality Metrics From Image Quality\n Metrics","summary":" In this study, we investigate the feasibility of utilizing state-of-the-art\nimage perceptual metrics for evaluating audio signals by representing them as\nspectrograms. The encouraging outcome of the proposed approach is based on the\nsimilarity between the neural mechanisms in the auditory and visual pathways.\nFurthermore, we customise one of the metrics which has a psychoacoustically\nplausible architecture to account for the peculiarities of sound signals. 
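The spectrogram-as-image idea above can be sketched by scoring audio with an off-the-shelf image metric; below, SSIM over log-spectrograms, with STFT parameters chosen arbitrarily for illustration:

```python
# Minimal sketch: score audio quality with an image metric (SSIM) applied
# to log-spectrograms of the reference and degraded signals.
import numpy as np
from scipy.signal import spectrogram
from skimage.metrics import structural_similarity as ssim

def audio_quality(ref, deg, sr=16000):
    """ref, deg: 1-D float arrays of equal length sampled at sr."""
    def log_spec(x):
        _, _, s = spectrogram(x, fs=sr, nperseg=512, noverlap=256)
        return np.log1p(s)
    a, b = log_spec(ref), log_spec(deg)
    return ssim(a, b, data_range=max(a.max(), b.max()) - min(a.min(), b.min()))

t = np.linspace(0, 1, 16000, endpoint=False)
clean = np.sin(2 * np.pi * 440 * t)
noisy = clean + 0.1 * np.random.default_rng(0).normal(size=t.size)
print(audio_quality(clean, noisy))  # < 1.0; identical signals give 1.0
```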
We\nevaluate the effectiveness of our proposed metric and several baseline metrics\nusing a music dataset, with promising results in terms of the correlation\nbetween the metrics and the perceived quality of audio as rated by human\nevaluators.\n","authors":["Tashi Namgyal","Alexander Hepburn","Raul Santos-Rodriguez","Valero Laparra","Jesus Malo"],"pdf_url":"https://arxiv.org/pdf/2305.11582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15745v2","updated":"2023-08-30T15:58:56Z","published":"2023-07-28T18:01:08Z","title":"Context-VQA: Towards Context-Aware and Purposeful Visual Question\n Answering","summary":" Visual question answering (VQA) has the potential to make the Internet more\naccessible in an interactive way, allowing people who cannot see images to ask\nquestions about them. However, multiple studies have shown that people who are\nblind or have low-vision prefer image explanations that incorporate the context\nin which an image appears, yet current VQA datasets focus on images in\nisolation. We argue that VQA models will not fully succeed at meeting people's\nneeds unless they take context into account. To further motivate and analyze\nthe distinction between different contexts, we introduce Context-VQA, a VQA\ndataset that pairs images with contexts, specifically types of websites (e.g.,\na shopping website). We find that the types of questions vary systematically\nacross contexts. For example, images presented in a travel context garner 2\ntimes more \"Where?\" questions, and images on social media and news garner 2.8\nand 1.8 times more \"Who?\" questions than the average. We also find that context\neffects are especially important when participants can't see the image. These\nresults demonstrate that context affects the types of questions asked and that\nVQA models should be context-sensitive to better meet people's needs,\nespecially in accessibility settings.\n","authors":["Nandita Naik","Christopher Potts","Elisa Kreiss"],"pdf_url":"https://arxiv.org/pdf/2307.15745v2.pdf","comment":"Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision\n and Language"},{"id":"http://arxiv.org/abs/2308.14480v2","updated":"2023-08-30T15:33:01Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. 
Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.16083v1","updated":"2023-08-30T15:15:31Z","published":"2023-08-30T15:15:31Z","title":"Learned Image Reasoning Prior Penetrates Deep Unfolding Network for\n Panchromatic and Multi-Spectral Image Fusion","summary":" The success of deep neural networks for pan-sharpening is commonly in a form\nof black box, lacking transparency and interpretability. To alleviate this\nissue, we propose a novel model-driven deep unfolding framework with image\nreasoning prior tailored for the pan-sharpening task. Different from existing\nunfolding solutions that deliver the proximal operator networks as the\nuncertain and vague priors, our framework is motivated by the content reasoning\nability of masked autoencoders (MAE) with insightful designs. Specifically, the\npre-trained MAE with spatial masking strategy, acting as intrinsic reasoning\nprior, is embedded into unfolding architecture. Meanwhile, the pre-trained MAE\nwith spatial-spectral masking strategy is treated as the regularization term\nwithin loss function to constrain the spatial-spectral consistency. Such\ndesigns penetrate the image reasoning prior into deep unfolding networks while\nimproving its interpretability and representation capability. The uniqueness of\nour framework is that the holistic learning process is explicitly integrated\nwith the inherent physical mechanism underlying the pan-sharpening task.\nExtensive experiments on multiple satellite datasets demonstrate the\nsuperiority of our method over the existing state-of-the-art approaches. Code\nwill be released at \\url{https://manman1995.github.io/}.\n","authors":["Man Zhou","Jie Huang","Naishan Zheng","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.16083v1.pdf","comment":"10 pages; Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16082v1","updated":"2023-08-30T15:14:56Z","published":"2023-08-30T15:14:56Z","title":"SignDiff: Learning Diffusion Models for American Sign Language\n Production","summary":" The field of Sign Language Production (SLP) lacked a large-scale, pre-trained\nmodel based on deep learning for continuous American Sign Language (ASL)\nproduction in the past decade. This limitation hampers communication for all\nindividuals with disabilities relying on ASL. To address this issue, we\nundertook the secondary development and utilization of How2Sign, one of the\nlargest publicly available ASL datasets. Despite its significance, prior\nresearchers in the field of sign language have not effectively employed this\ncorpus due to the intricacies involved in American Sign Language Production\n(ASLP).\n To conduct large-scale ASLP, we propose SignDiff based on the latest work in\nrelated fields, which is a dual-condition diffusion pre-training model that can\ngenerate human sign language speakers from a skeleton pose. 
SignDiff has a\nnovel Frame Reinforcement Network called FR-Net, similar to dense human pose\nestimation work, which enhances the correspondence between text lexical symbols\nand sign language dense pose frames and reduces the occurrence of multiple fingers\nin the diffusion model. In addition, our ASLP method introduces two improved\nmodules and a new loss function to improve the accuracy and quality of sign\nlanguage skeletal posture and enhance the ability of the model to train on\nlarge-scale data.\n We propose the first baseline for ASL production and report BLEU-4 scores of\n17.19 and 12.85 on the How2Sign dev/test sets. We also evaluated our\nmodel on the previous mainstream dataset PHOENIX14T, where the main\nexperiments achieve SOTA results. In addition, our image quality far\nexceeds all previous results by 10 percentage points on the SSIM metric.\nFinally, we conducted ablation studies and qualitative evaluations for\ndiscussion.\n","authors":["Sen Fang","Chunyu Sui","Xuedong Zhang","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2308.16082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16075v1","updated":"2023-08-30T14:52:14Z","published":"2023-08-30T14:52:14Z","title":"Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for\n English to Indian Languages","summary":" The study investigates the effectiveness of utilizing multimodal information\nin Neural Machine Translation (NMT). While prior research focused on using\nmultimodal data in low-resource scenarios, this study examines how image\nfeatures impact translation when added to a large-scale, pre-trained unimodal\nNMT system. Surprisingly, the study finds that images might be redundant in\nthis context. Additionally, the research introduces synthetic noise to assess\nwhether images help the model deal with textual noise. Multimodal models\nslightly outperform text-only models in noisy settings, even with random\nimages. The study's experiments translate from English to Hindi, Bengali, and\nMalayalam, outperforming state-of-the-art benchmarks significantly.\nInterestingly, the effect of visual context varies with source text noise: no\nvisual context works best for non-noisy translations, cropped image features\nare optimal for low noise, and full image features work better in high-noise\nscenarios. This sheds light on the role of visual context, especially in noisy\nsettings, opening up a new research direction for Noisy Neural Machine\nTranslation in multimodal setups. The research emphasizes the importance of\ncombining visual and textual information for improved translation in various\nenvironments.\n","authors":["Baban Gain","Dibyanayan Bandyopadhyay","Samrat Mukherjee","Chandranath Adak","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2308.16075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16071v1","updated":"2023-08-30T14:49:34Z","published":"2023-08-30T14:49:34Z","title":"Semantic Image Synthesis via Class-Adaptive Cross-Attention","summary":" In semantic image synthesis, the state of the art is dominated by methods\nthat use spatially-adaptive normalization layers, which allow for excellent\nvisual generation quality and editing versatility. Granted their efficacy,\nrecent research efforts have focused on finer-grained local style control\nand multi-modal generation. 
By construction though, such layers tend to\noverlook global image statistics leading to unconvincing local style editing\nand causing global inconsistencies such as color or illumination distribution\nshifts. Also, the semantic layout is required for mapping styles in the\ngenerator, putting a strict alignment constraint over the features. In\nresponse, we designed a novel architecture where cross-attention layers are\nused in place of de-normalization ones for conditioning the image generation.\nOur model inherits the advantages of both solutions, retaining state-of-the-art\nreconstruction quality, as well as improved global and local style transfer.\nCode and models available at https://github.com/TFonta/CA2SIS.\n","authors":["Tomaso Fontanini","Claudio Ferrari","Giuseppe Lisanti","Massimo Bertozzi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2308.16071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.07752v2","updated":"2023-08-30T14:39:24Z","published":"2021-07-16T08:07:22Z","title":"NeXtQSM -- A complete deep learning pipeline for data-consistent\n quantitative susceptibility mapping trained with hybrid data","summary":" Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great\npotential in recent years, obtaining similar results to established\nnon-learning approaches. Many current deep learning approaches are not data\nconsistent, require in vivo training data or solve the QSM problem in\nconsecutive steps resulting in the propagation of errors. Here we aim to\novercome these limitations and developed a framework to solve the QSM\nprocessing steps jointly. We developed a new hybrid training data generation\nmethod that enables the end-to-end training for solving background field\ncorrection and dipole inversion in a data-consistent fashion using a\nvariational network that combines the QSM model term and a learned regularizer.\nWe demonstrate that NeXtQSM overcomes the limitations of previous deep learning\nmethods. NeXtQSM offers a new deep learning based pipeline for computing\nquantitative susceptibility maps that integrates each processing step into the\ntraining and provides results that are robust and fast.\n","authors":["Francesco Cognolato","Kieran O'Brien","Jin Jin","Simon Robinson","Frederik B. Laun","Markus Barth","Steffen Bollmann"],"pdf_url":"https://arxiv.org/pdf/2107.07752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08637v2","updated":"2023-08-30T14:28:37Z","published":"2023-06-14T17:07:51Z","title":"TAPIR: Tracking Any Point with per-frame Initialization and temporal\n Refinement","summary":" We present a novel model for Tracking Any Point (TAP) that effectively tracks\nany queried point on any physical surface throughout a video sequence. Our\napproach employs two stages: (1) a matching stage, which independently locates\na suitable candidate point match for the query point on every other frame, and\n(2) a refinement stage, which updates both the trajectory and query features\nbased on local correlations. The resulting model surpasses all baseline methods\nby a significant margin on the TAP-Vid benchmark, as demonstrated by an\napproximate 20% absolute average Jaccard (AJ) improvement on DAVIS. Our model\nfacilitates fast inference on long and high-resolution video sequences. On a\nmodern GPU, our implementation has the capacity to track points faster than\nreal-time, and can be flexibly extended to higher-resolution videos. 
Given the\nhigh-quality trajectories extracted from a large dataset, we demonstrate a\nproof-of-concept diffusion model which generates trajectories from static\nimages, enabling plausible animations. Visualizations, source code, and\npretrained models can be found on our project webpage.\n","authors":["Carl Doersch","Yi Yang","Mel Vecerik","Dilara Gokay","Ankush Gupta","Yusuf Aytar","Joao Carreira","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2306.08637v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2210.05152v5","updated":"2023-08-30T14:24:46Z","published":"2022-10-11T05:11:41Z","title":"TriangleNet: Edge Prior Augmented Network for Semantic Segmentation\n through Cross-Task Consistency","summary":" This paper addresses the task of semantic segmentation in computer vision,\naiming to achieve precise pixel-wise classification. We investigate the joint\ntraining of models for semantic edge detection and semantic segmentation, which\nhas shown promise. However, implicit cross-task consistency learning in\nmulti-task networks is limited. To address this, we propose a novel \"decoupled\ncross-task consistency loss\" that explicitly enhances cross-task consistency.\nOur semantic segmentation network, TriangleNet, achieves a substantial 2.88\\%\nimprovement over the Baseline in mean Intersection over Union (mIoU) on the\nCityscapes test set. Notably, TriangleNet operates at 77.4\\% mIoU/46.2 FPS on\nCityscapes, showcasing real-time inference capabilities at full resolution.\nWith multi-scale inference, performance is further enhanced to 77.8\\%.\nFurthermore, TriangleNet consistently outperforms the Baseline on the FloodNet\ndataset, demonstrating its robust generalization capabilities. The proposed\nmethod underscores the significance of multi-task learning and explicit\ncross-task consistency enhancement for advancing semantic segmentation and\nhighlights the potential of multitasking in real-time semantic segmentation.\n","authors":["Dan Zhang","Rui Zheng","Luosang Gadeng","Pei Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05152v5.pdf","comment":"Accepted for publication in the journal \"International Journal of\n Intelligent Systems\""},{"id":"http://arxiv.org/abs/2302.14416v3","updated":"2023-08-30T14:22:32Z","published":"2023-02-28T08:48:45Z","title":"DREAM: Efficient Dataset Distillation by Representative Matching","summary":" Dataset distillation aims to synthesize small datasets with little\ninformation loss from original large-scale ones for reducing storage and\ntraining costs. Recent state-of-the-art methods mainly constrain the sample\nsynthesis process by matching synthetic images and the original ones regarding\ngradients, embedding distributions, or training trajectories. Although there\nare various matching objectives, currently the strategy for selecting original\nimages is limited to naive random sampling.\n We argue that random sampling overlooks the evenness of the selected sample\ndistribution, which may result in noisy or biased matching targets.\n Besides, the sample diversity is also not constrained by random sampling.\nThese factors together lead to optimization instability in the distilling\nprocess and degrade the training efficiency. Accordingly, we propose a novel\nmatching strategy named as \\textbf{D}ataset distillation by\n\\textbf{RE}present\\textbf{A}tive \\textbf{M}atching (DREAM), where only\nrepresentative original images are selected for matching. 
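DREAM's representative matching, described above, replaces random sampling with an even selection; a minimal sketch using k-means centroids as representatives (the clustering space and number of representatives are assumptions):

```python
# Minimal sketch of representative matching: pick the images closest to
# k-means centroids instead of sampling at random.
import numpy as np
from sklearn.cluster import KMeans

def select_representatives(features, n_select, seed=0):
    """features: (n, d) per-image features for one class."""
    km = KMeans(n_clusters=n_select, n_init=10, random_state=seed).fit(features)
    chosen = []
    for c in km.cluster_centers_:
        # Nearest real sample to each centroid becomes the matching target.
        chosen.append(int(np.argmin(np.linalg.norm(features - c, axis=1))))
    return chosen

rng = np.random.default_rng(0)
print(select_representatives(rng.normal(size=(100, 32)), n_select=5))
```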
DREAM is able to be\neasily plugged into popular dataset distillation frameworks and reduce the\ndistilling iterations by more than 8 times without performance drop. Given\nsufficient training time, DREAM further provides significant improvements and\nachieves state-of-the-art performances.\n","authors":["Yanqing Liu","Jianyang Gu","Kai Wang","Zheng Zhu","Wei Jiang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2302.14416v3.pdf","comment":"Efficient matching for dataset distillation"},{"id":"http://arxiv.org/abs/2308.14500v2","updated":"2023-08-30T14:18:58Z","published":"2023-08-28T11:20:48Z","title":"LAC -- Latent Action Composition for Skeleton-based Action Segmentation","summary":" Skeleton-based action segmentation requires recognizing composable actions in\nuntrimmed videos. Current approaches decouple this problem by first extracting\nlocal visual features from skeleton sequences and then processing them by a\ntemporal model to classify frame-wise actions. However, their performances\nremain limited as the visual features cannot sufficiently express composable\nactions. In this context, we propose Latent Action Composition (LAC), a novel\nself-supervised framework aiming at learning from synthesized composable\nmotions for skeleton-based action segmentation. LAC is composed of a novel\ngeneration module towards synthesizing new sequences. Specifically, we design a\nlinear latent space in the generator to represent primitive motion. New\ncomposed motions can be synthesized by simply performing arithmetic operations\non latent representations of multiple input skeleton sequences. LAC leverages\nsuch synthesized sequences, which have large diversity and complexity, for\nlearning visual representations of skeletons in both sequence and frame spaces\nvia contrastive learning. The resulting visual encoder has a high expressive\npower and can be effectively transferred onto action segmentation tasks by\nend-to-end fine-tuning without the need for additional temporal models. We\nconduct a study focusing on transfer-learning and we show that representations\nlearned from pre-trained LAC outperform the state-of-the-art by a large margin\non TSU, Charades, PKU-MMD datasets.\n","authors":["Di Yang","Yaohui Wang","Antitza Dantcheva","Quan Kong","Lorenzo Garattoni","Gianpiero Francesca","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2308.14500v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.12247v2","updated":"2023-08-30T14:09:13Z","published":"2023-03-22T01:01:14Z","title":"Exploring the Benefits of Visual Prompting in Differential Privacy","summary":" Visual Prompting (VP) is an emerging and powerful technique that allows\nsample-efficient adaptation to downstream tasks by engineering a well-trained\nfrozen source model. In this work, we explore the benefits of VP in\nconstructing compelling neural network classifiers with differential privacy\n(DP). We explore and integrate VP into canonical DP training methods and\ndemonstrate its simplicity and efficiency. In particular, we discover that VP\nin tandem with PATE, a state-of-the-art DP training method that leverages the\nknowledge transfer from an ensemble of teachers, achieves the state-of-the-art\nprivacy-utility trade-off with minimum expenditure of privacy budget. Moreover,\nwe conduct additional experiments on cross-domain image classification with a\nsufficient domain gap to further unveil the advantage of VP in DP. 
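LAC's latent arithmetic described above amounts to adding latent codes before decoding; schematically, with `encoder` and `decoder` as assumed stand-ins for the paper's generation module:

```python
# Schematic latent action composition: arithmetic in a linear latent space.
def compose_motions(seq_a, seq_b, encoder, decoder, alpha=1.0, beta=1.0):
    z = alpha * encoder(seq_a) + beta * encoder(seq_b)  # combine primitive motions
    return decoder(z)  # a synthesized sequence containing both motions
```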
Lastly, we\nalso conduct extensive ablation studies to validate the effectiveness and\ncontribution of VP under DP consideration. Our code is available at\n(https://github.com/EzzzLi/Prompt-PATE).\n","authors":["Yizhe Li","Yu-Lin Tsai","Xuebin Ren","Chia-Mu Yu","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12247v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.06208v3","updated":"2023-08-30T14:07:49Z","published":"2023-06-05T23:07:01Z","title":"DeltaNN: Assessing the Impact of Computational Environment Parameters on\n the Performance of Image Recognition Models","summary":" Image recognition tasks typically use deep learning and require enormous\nprocessing power, thus relying on hardware accelerators like GPUs and TPUs for\nfast, timely processing. Failure in real-time image recognition tasks can occur\ndue to sub-optimal mapping on hardware accelerators during model deployment,\nwhich may lead to timing uncertainty and erroneous behavior. Mapping on\nhardware accelerators is done using multiple software components like deep\nlearning frameworks, compilers, and device libraries, that we refer to as the\ncomputational environment. Owing to the increased use of image recognition\ntasks in safety-critical applications like autonomous driving and medical\nimaging, it is imperative to assess their robustness to changes in the\ncomputational environment, as the impact of parameters like deep learning\nframeworks, compiler optimizations, and hardware devices on model performance\nand correctness is not yet well understood.\n In this paper we present a differential testing framework, DeltaNN, that\nallows us to assess the impact of different computational environment\nparameters on the performance of image recognition models during deployment,\npost training. DeltaNN generates different implementations of a given image\nrecognition model for variations in environment parameters, namely, deep\nlearning frameworks, compiler optimizations and hardware devices and analyzes\ndifferences in model performance as a result. Using DeltaNN, we conduct an\nempirical study of robustness analysis of three popular image recognition\nmodels using the ImageNet dataset. We report the impact in terms of\nmisclassifications and inference time differences across different settings. In\ntotal, we observed up to 72% output label differences across deep learning\nframeworks, and up to 81% unexpected performance degradation in terms of\ninference time, when applying compiler optimizations.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06208v3.pdf","comment":"11 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2305.08854v2","updated":"2023-08-30T14:01:36Z","published":"2023-05-15T17:59:57Z","title":"Laughing Matters: Introducing Laughing-Face Generation using Diffusion\n Models","summary":" Speech-driven animation has gained significant traction in recent years, with\ncurrent methods achieving near-photorealistic results. However, the field\nremains underexplored regarding non-verbal communication despite evidence\ndemonstrating its importance in human interaction. In particular, generating\nlaughter sequences presents a unique challenge due to the intricacy and nuances\nof this behaviour. This paper aims to bridge this gap by proposing a novel\nmodel capable of generating realistic laughter sequences, given a still\nportrait and an audio clip containing laughter. 
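A DeltaNN-style differential test reduces to running one model under two computational environments and diffing the outputs; a bare-bones harness sketch, where `predict_a` and `predict_b` are assumed wrappers around the two deployments:

```python
# Bare-bones differential testing: compare labels and timing between two
# deployments of the same image recognition model.
import time
import numpy as np

def differential_test(predict_a, predict_b, batches):
    mismatches, total, times_a, times_b = 0, 0, [], []
    for x in batches:
        t0 = time.perf_counter(); ya = predict_a(x); times_a.append(time.perf_counter() - t0)
        t0 = time.perf_counter(); yb = predict_b(x); times_b.append(time.perf_counter() - t0)
        # Count output label disagreements between the two environments.
        mismatches += int(np.sum(np.argmax(ya, -1) != np.argmax(yb, -1)))
        total += len(x)
    return {"label_diff_rate": mismatches / total,
            "mean_time_a": float(np.mean(times_a)),
            "mean_time_b": float(np.mean(times_b))}
```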
We highlight the failure cases\nof traditional facial animation methods and leverage recent advances in\ndiffusion models to produce convincing laughter videos. We train our model on a\ndiverse set of laughter datasets and introduce an evaluation metric\nspecifically designed for laughter. When compared with previous speech-driven\napproaches, our model achieves state-of-the-art performance across all metrics,\neven when these are re-trained for laughter generation. Our code and project\nare publicly available\n","authors":["Antoni Bigata Casademunt","Rodrigo Mira","Nikita Drobyshev","Konstantinos Vougioukas","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2305.08854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16041v1","updated":"2023-08-30T14:00:48Z","published":"2023-08-30T14:00:48Z","title":"From Pixels to Portraits: A Comprehensive Survey of Talking Head\n Generation Techniques and Applications","summary":" Recent advancements in deep learning and computer vision have led to a surge\nof interest in generating realistic talking heads. This paper presents a\ncomprehensive survey of state-of-the-art methods for talking head generation.\nWe systematically categorises them into four main approaches: image-driven,\naudio-driven, video-driven and others (including neural radiance fields (NeRF),\nand 3D-based methods). We provide an in-depth analysis of each method,\nhighlighting their unique contributions, strengths, and limitations.\nFurthermore, we thoroughly compare publicly available models, evaluating them\non key aspects such as inference time and human-rated quality of the generated\noutputs. Our aim is to provide a clear and concise overview of the current\nlandscape in talking head generation, elucidating the relationships between\ndifferent approaches and identifying promising directions for future research.\nThis survey will serve as a valuable reference for researchers and\npractitioners interested in this rapidly evolving field.\n","authors":["Shreyank N Gowda","Dheeraj Pandey","Shashank Narayana Gowda"],"pdf_url":"https://arxiv.org/pdf/2308.16041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10611v2","updated":"2023-08-30T13:50:25Z","published":"2023-01-25T14:45:13Z","title":"Discriminator-free Unsupervised Domain Adaptation for Multi-label Image\n Classification","summary":" In this paper, a discriminator-free adversarial-based Unsupervised Domain\nAdaptation (UDA) for Multi-Label Image Classification (MLIC) referred to as\nDDA-MLIC is proposed. Recently, some attempts have been made for introducing\nadversarial-based UDA methods in the context of MLIC. However, these methods\nwhich rely on an additional discriminator subnet present one major shortcoming.\nThe learning of domain-invariant features may harm their task-specific\ndiscriminative power, since the classification and discrimination tasks are\ndecoupled. Herein, we propose to overcome this issue by introducing a novel\nadversarial critic that is directly deduced from the task-specific classifier.\nSpecifically, a two-component Gaussian Mixture Model (GMM) is fitted on the\nsource and target predictions in order to distinguish between two clusters.\nThis allows extracting a Gaussian distribution for each component. The\nresulting Gaussian distributions are then used for formulating an adversarial\nloss based on a Frechet distance. The proposed method is evaluated on several\nmulti-label image datasets covering three different types of domain shift. 
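The adversarial critic described in the DDA-MLIC abstract (a two-component GMM plus a Fréchet distance between the fitted Gaussians) has a compact univariate form, sketched here under the assumption of scalar prediction statistics:

```python
# Sketch of a GMM-based Frechet critic over pooled source/target predictions.
import numpy as np
from sklearn.mixture import GaussianMixture

def frechet_critic(source_preds, target_preds):
    """source_preds, target_preds: 1-D arrays of task-classifier outputs."""
    x = np.concatenate([source_preds, target_preds]).reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=0).fit(x)
    m1, m2 = gmm.means_.ravel()
    v1, v2 = gmm.covariances_.ravel()
    # Frechet distance between two univariate Gaussians:
    # d^2 = (m1 - m2)^2 + (s1 - s2)^2
    return (m1 - m2) ** 2 + (np.sqrt(v1) - np.sqrt(v2)) ** 2
```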
The\nobtained results demonstrate that DDA-MLIC outperforms existing\nstate-of-the-art methods in terms of precision while requiring a lower number\nof parameters. The code will be made publicly available online.\n","authors":["Indel Pal Singh","Enjie Ghorbel","Anis Kacem","Arunkumar Rathinam","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2301.10611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06157v3","updated":"2023-08-30T13:41:23Z","published":"2023-06-10T23:50:02Z","title":"Fault Localization for Buggy Deep Learning Framework Conversions in\n Image Recognition","summary":" When deploying Deep Neural Networks (DNNs), developers often convert models\nfrom one deep learning framework to another (e.g., TensorFlow to PyTorch).\nHowever, this process is error-prone and can impact target model accuracy. To\nidentify the extent of such impact, we perform and briefly present a\ndifferential analysis against three DNNs widely used for image recognition\n(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep\nlearning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which\nrevealed numerous model crashes and output label discrepancies of up to 72%. To\nmitigate such errors, we present a novel approach towards fault localization\nand repair of buggy deep learning framework conversions, focusing on\npre-trained image recognition models. Our technique consists of four stages of\nanalysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters,\nand 4) graph representation. In addition, we propose various strategies towards\nfault repair of the faults detected. We implement our technique on top of the\nApache TVM deep learning compiler, and we test it by conducting a preliminary\nfault localization analysis for the conversion of InceptionV3 from TF to\nTFLite. Our approach detected a fault in a common DNN converter tool, which\nintroduced precision errors in weights, reducing model accuracy. After our\nfault localization, we repaired the issue, reducing our conversion error to\nzero.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06157v3.pdf","comment":"5 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.14074v2","updated":"2023-08-30T13:40:21Z","published":"2023-08-27T11:37:26Z","title":"Nonrigid Object Contact Estimation With Regional Unwrapping Transformer","summary":" Acquiring contact patterns between hands and nonrigid objects is a common\nconcern in the vision and robotics community. However, existing learning-based\nmethods focus more on contact with rigid ones from monocular images. When\nadopting them for nonrigid contact, a major problem is that the existing\ncontact representation is restricted by the geometry of the object.\nConsequently, contact neighborhoods are stored in an unordered manner and\ncontact features are difficult to align with image cues. At the core of our\napproach lies a novel hand-object contact representation called RUPs (Region\nUnwrapping Profiles), which unwrap the roughly estimated hand-object surfaces\nas multiple high-resolution 2D regional profiles. The region grouping strategy\nis consistent with the hand kinematic bone division because they are the\nprimitive initiators for a composite contact pattern. Based on this\nrepresentation, our Regional Unwrapping Transformer (RUFormer) learns the\ncorrelation priors across regions from monocular inputs and predicts\ncorresponding contact and deformed transformations. 
Our experiments demonstrate\nthat the proposed framework can robustly estimate the deformed degrees and\ndeformed transformations, which makes it suitable for both nonrigid and rigid\ncontact.\n","authors":["Wei Xie","Zimeng Zhao","Shiying Li","Binghui Zuo","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14074v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.15016v2","updated":"2023-08-30T13:33:59Z","published":"2023-07-27T17:19:32Z","title":"How Good is Google Bard's Visual Understanding? An Empirical Study on\n Open Challenges","summary":" Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in\nthe field of conversational AI. Notably, Bard has recently been updated to\nhandle visual inputs alongside text prompts during conversations. Given Bard's\nimpressive track record in handling textual inputs, we explore its capabilities\nin understanding and interpreting visual data (images) conditioned by text\nquestions. This exploration holds the potential to unveil new insights and\nchallenges for Bard and other forthcoming multi-modal Generative models,\nespecially in addressing complex computer vision problems that demand accurate\nvisual and language understanding. Specifically, in this study, we focus on 15\ndiverse task scenarios encompassing regular, camouflaged, medical, under-water\nand remote sensing data to comprehensively evaluate Bard's performance. Our\nprimary finding indicates that Bard still struggles in these vision scenarios,\nhighlighting the significant gap in vision-based understanding that needs to be\nbridged in future developments. We expect that this empirical study will prove\nvaluable in advancing future models, leading to enhanced capabilities in\ncomprehending and interpreting fine-grained visual data. Our project is\nreleased on https://github.com/htqin/GoogleBard-VisUnderstand\n","authors":["Haotong Qin","Ge-Peng Ji","Salman Khan","Deng-Ping Fan","Fahad Shahbaz Khan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2307.15016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14036v2","updated":"2023-08-30T13:27:35Z","published":"2023-08-27T08:10:23Z","title":"MB-TaylorFormer: Multi-branch Efficient Transformer Expanded by Taylor\n Formula for Image Dehazing","summary":" In recent years, Transformer networks are beginning to replace pure\nconvolutional neural networks (CNNs) in the field of computer vision due to\ntheir global receptive field and adaptability to input. However, the quadratic\ncomputational complexity of softmax-attention limits the wide application in\nimage dehazing task, especially for high-resolution images. To address this\nissue, we propose a new Transformer variant, which applies the Taylor expansion\nto approximate the softmax-attention and achieves linear computational\ncomplexity. A multi-scale attention refinement module is proposed as a\ncomplement to correct the error of the Taylor expansion. Furthermore, we\nintroduce a multi-branch architecture with multi-scale patch embedding to the\nproposed Transformer, which embeds features by overlapping deformable\nconvolution of different scales. 
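The linear attention attributed to the Taylor expansion in the MB-TaylorFormer abstract can be made concrete. The following is a generic sketch of first-order Taylor linear attention (exp(q.k) ~ 1 + q.k), not the paper's implementation: reassociating (QK^T)V as Q(K^T V) avoids ever forming the N x N attention matrix, dropping the cost from O(N^2 d) to O(N d^2).

# Generic sketch of Taylor-expanded linear attention (an illustration
# of the technique, not the MB-TaylorFormer code).
import numpy as np

def taylor_linear_attention(Q, K, V):
    N, d = Q.shape
    # Unit-normalise q and k so the weight 1 + q.k stays non-negative.
    Q = Q / np.linalg.norm(Q, axis=-1, keepdims=True)
    K = K / np.linalg.norm(K, axis=-1, keepdims=True)
    kv = K.T @ V                      # (d, d), shared across all queries
    k_sum = K.sum(axis=0)             # (d,)
    num = V.sum(axis=0) + Q @ kv      # sum_j (1 + q.k_j) v_j
    den = N + Q @ k_sum               # matching normaliser sum_j (1 + q.k_j)
    return num / den[:, None]

Q = np.random.randn(1024, 64)
K = np.random.randn(1024, 64)
V = np.random.randn(1024, 64)
out = taylor_linear_attention(Q, K, V)  # (1024, 64); no N x N matrix formed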
The design of multi-scale patch embedding is\nbased on three key ideas: 1) various sizes of the receptive field; 2)\nmulti-level semantic information; 3) flexible shapes of the receptive field.\nOur model, named Multi-branch Transformer expanded by Taylor formula\n(MB-TaylorFormer), can embed coarse-to-fine features more flexibly at the patch\nembedding stage and capture long-distance pixel interactions with limited\ncomputational cost. Experimental results on several dehazing benchmarks show\nthat MB-TaylorFormer achieves state-of-the-art (SOTA) performance with a light\ncomputational burden. The source code and pre-trained models are available at\nhttps://github.com/FVL2020/ICCV-2023-MB-TaylorFormer.\n","authors":["Yuwei Qiu","Kaihao Zhang","Chenxi Wang","Wenhan Luo","Hongdong Li","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2308.14036v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16018v1","updated":"2023-08-30T13:20:54Z","published":"2023-08-30T13:20:54Z","title":"Topology-aware MLP for Skeleton-based Action Recognition","summary":" Graph convolution networks (GCNs) have achieved remarkable performance in\nskeleton-based action recognition. However, previous GCN-based methods\nhave relied excessively on elaborate human body priors and constructed complex\nfeature aggregation mechanisms, which limits the generalizability of networks.\nTo solve these problems, we propose a novel Spatial Topology Gating Unit\n(STGU), which is an MLP-based variant without extra priors, to capture the\nco-occurrence topology features that encode the spatial dependency across all\njoints. In STGU, to model the sample-specific and completely independent\npoint-wise topology attention, a new gate-based feature interaction mechanism\nis introduced to activate the features point-to-point by the attention map\ngenerated from the input. Based on the STGU, in this work, we propose the first\ntopology-aware MLP-based model, Ta-MLP, for skeleton-based action recognition.\nIn comparison with previous methods on three large-scale datasets,\nTa-MLP achieves competitive performance. In addition, Ta-MLP reduces the\nparameters by up to 62.5% with favorable results. Compared with previous\nstate-of-the-art (SOTA) approaches, Ta-MLP pushes the frontier of real-time\naction recognition. The code will be available at\nhttps://github.com/BUPTSJZhang/Ta-MLP.\n","authors":["Shaojie Zhang","Jianqin Yin","Yonghao Dang","Jiajun Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06289v2","updated":"2023-08-30T13:01:39Z","published":"2023-06-09T22:29:56Z","title":"SegViTv2: Exploring Efficient and Continual Semantic Segmentation with\n Plain Vision Transformers","summary":" This paper investigates the capability of plain Vision Transformers (ViTs)\nfor semantic segmentation using the encoder-decoder framework and introduces\n\\textbf{SegViTv2}. In this study, we introduce a novel Attention-to-Mask (\\atm)\nmodule to design a lightweight decoder effective for plain ViT. The proposed\nATM converts the global attention map into semantic masks for high-quality\nsegmentation results. Our decoder outperforms the popular decoder UPerNet using\nvarious ViT backbones while consuming only about $5\\%$ of the computational\ncost.
For the encoder, we address the concern of the relatively high\ncomputational cost in the ViT-based encoders and propose a \\emph{Shrunk++}\nstructure that incorporates edge-aware query-based down-sampling (EQD) and\nquery-based upsampling (QU) modules. The Shrunk++ structure reduces the\ncomputational cost of the encoder by up to $50\\%$ while maintaining competitive\nperformance. Furthermore, we propose to adapt SegViT for continual semantic\nsegmentation, demonstrating nearly zero forgetting of previously learned\nknowledge. Experiments show that our proposed SegViTv2 surpasses recent\nsegmentation methods on three popular benchmarks including the ADE20k,\nCOCO-Stuff-10k and PASCAL-Context datasets. The code is available through the\nfollowing link: \\url{https://github.com/zbwxp/SegVit}.\n","authors":["Bowen Zhang","Liyang Liu","Minh Hieu Phan","Zhi Tian","Chunhua Shen","Yifan Liu"],"pdf_url":"https://arxiv.org/pdf/2306.06289v2.pdf","comment":"IJCV 2023 accepted, 21 pages, 8 figures, 12 tables"},{"id":"http://arxiv.org/abs/2305.14730v2","updated":"2023-08-30T13:00:09Z","published":"2023-05-24T05:06:59Z","title":"BinaryViT: Towards Efficient and Accurate Binary Vision Transformers","summary":" Vision Transformers (ViTs) have emerged as the fundamental architecture for\nmost computer vision fields, but the considerable memory and computation costs\nhinder their application on resource-limited devices. As one of the most\npowerful compression methods, binarization reduces the computation of the\nneural network by quantizing the weights and activation values as $\\pm$1.\nAlthough existing binarization methods have demonstrated excellent performance\non Convolutional Neural Networks (CNNs), the full binarization of ViTs is still\nunder-studied and suffers from a significant performance drop. In this paper, we\nfirst argue empirically that the severe performance degradation is mainly\ncaused by the weight oscillation in the binarization training and the\ninformation distortion in the activation of ViTs. Based on these analyses, we\npropose $\\textbf{BinaryViT}$, an accurate full binarization scheme for ViTs,\nwhich pushes the quantization of ViTs to the limit. Specifically, we propose a\nnovel gradient regularization scheme (GRS) for driving a bimodal distribution\nof the weights to reduce oscillation in binarization training. Moreover, we\ndesign an activation shift module (ASM) to adaptively tune the activation\ndistribution to reduce the information distortion caused by binarization.\nExtensive experiments on the ImageNet dataset show that our BinaryViT consistently\nsurpasses the strong baseline by 2.05% and improves the accuracy of fully\nbinarized ViTs to a usable level. Furthermore, our method achieves impressive\nsavings of 16.2$\\times$ and 17.7$\\times$ in model size and OPs compared to the\nfull-precision DeiT-S.\n","authors":["Junrui Xiao","Zhikai Li","Lianwei Yang","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2305.14730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.05593v3","updated":"2023-08-30T12:39:29Z","published":"2022-08-10T23:50:01Z","title":"Evaluating the Quality and Diversity of DCGAN-based Generatively\n Synthesized Diabetic Retinopathy Imagery","summary":" Publicly available diabetic retinopathy (DR) datasets are imbalanced,\ncontaining limited numbers of images with DR. This imbalance contributes to\noverfitting when training machine learning classifiers.
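The $\pm$1 quantization that BinaryViT builds on is commonly implemented as a sign function trained with a straight-through estimator. The sketch below illustrates that generic primitive only; the paper's GRS and ASM modules are not reproduced, and all names are illustrative.

# Hedged sketch of sign binarization with a straight-through estimator
# (the generic primitive, not the BinaryViT implementation).
import torch

class BinarizeSTE(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return torch.sign(x)          # quantise to +/-1 in the forward pass

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        # Straight-through estimator: pass gradients only where |x| <= 1.
        return grad_out * (x.abs() <= 1).float()

w = torch.randn(16, 16, requires_grad=True)   # latent full-precision weights
w_bin = BinarizeSTE.apply(w)                  # forward uses +/-1 weights
loss = w_bin.sum()
loss.backward()                               # gradient reaches the latent weights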
The impact of this\nimbalance is exacerbated as the severity of the DR stage increases, affecting\nthe classifiers' diagnostic capacity. The imbalance can be addressed using\nGenerative Adversarial Networks (GANs) to augment the datasets with synthetic\nimages. Generating synthetic images is advantageous if high-quality and\ndiversified images are produced. To evaluate the quality and diversity of\nsynthetic images, several evaluation metrics, such as Multi-Scale Structural\nSimilarity Index (MS-SSIM), Cosine Distance (CD), and Fr\\'echet Inception\nDistance (FID) are used. Understanding the effectiveness of each metric in\nevaluating the quality and diversity of GAN-based synthetic images is critical\nto select images for augmentation. To date, there has been limited analysis of\nthe appropriateness of these metrics in the context of biomedical imagery. This\nwork contributes an empirical assessment of these evaluation metrics as applied\nto synthetic Proliferative DR imagery generated by a Deep Convolutional GAN\n(DCGAN). Furthermore, the metrics' capacity to indicate the quality and\ndiversity of synthetic images and a correlation with classifier performance is\nundertaken. This enables a quantitative selection of synthetic imagery and an\ninformed augmentation strategy. Results indicate that FID is suitable for\nevaluating the quality, while MS-SSIM and CD are suitable for evaluating the\ndiversity of synthetic imagery. Furthermore, the superior performance of\nConvolutional Neural Network (CNN) and EfficientNet classifiers, as indicated\nby the F1 and AUC scores, for the augmented datasets demonstrates the efficacy\nof synthetic imagery to augment the imbalanced dataset.\n","authors":["Cristina-Madalina Dragan","Muhammad Muneeb Saad","Mubashir Husain Rehmani","Ruairi O'Reilly"],"pdf_url":"https://arxiv.org/pdf/2208.05593v3.pdf","comment":"29 Pages, 8 Figures, submitted to MEDAL23: Advances in Deep\n Generative Models for Medical Artificial Intelligence (Springer Nature\n series)"},{"id":"http://arxiv.org/abs/2308.15996v1","updated":"2023-08-30T12:37:03Z","published":"2023-08-30T12:37:03Z","title":"DTrOCR: Decoder-only Transformer for Optical Character Recognition","summary":" Typical text recognition methods rely on an encoder-decoder structure, in\nwhich the encoder extracts features from an image, and the decoder produces\nrecognized text from these features. In this study, we propose a simpler and\nmore effective method for text recognition, known as the Decoder-only\nTransformer for Optical Character Recognition (DTrOCR). This method uses a\ndecoder-only Transformer to take advantage of a generative language model that\nis pre-trained on a large corpus. We examined whether a generative language\nmodel that has been successful in natural language processing can also be\neffective for text recognition in computer vision. Our experiments demonstrated\nthat DTrOCR outperforms current state-of-the-art methods by a large margin in\nthe recognition of printed, handwritten, and scene text in both English and\nChinese.\n","authors":["Masato Fujitake"],"pdf_url":"https://arxiv.org/pdf/2308.15996v1.pdf","comment":"Accepted to WACV2024"},{"id":"http://arxiv.org/abs/2308.15989v1","updated":"2023-08-30T12:19:35Z","published":"2023-08-30T12:19:35Z","title":"DiffuVolume: Diffusion Model for Volume based Stereo Matching","summary":" Stereo matching is a significant part in many computer vision tasks and\ndriving-based applications. 
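The Fréchet Inception Distance cited in the DCGAN evaluation abstract has a closed form between Gaussians fitted to feature embeddings: FID = ||mu_r - mu_f||^2 + Tr(S_r + S_f - 2(S_r S_f)^{1/2}). A minimal sketch, assuming precomputed (N, D) Inception-style features (the inputs here are random stand-ins, not real embeddings):

# Illustrative FID computation over assumed precomputed features.
import numpy as np
from scipy import linalg

def fid(real_feats, fake_feats):
    mu1, mu2 = real_feats.mean(0), fake_feats.mean(0)
    s1 = np.cov(real_feats, rowvar=False)
    s2 = np.cov(fake_feats, rowvar=False)
    covmean = linalg.sqrtm(s1 @ s2)
    if np.iscomplexobj(covmean):      # numerical noise can leak into imag
        covmean = covmean.real
    return float(((mu1 - mu2) ** 2).sum()
                 + np.trace(s1 + s2 - 2.0 * covmean))

real_feats = np.random.randn(256, 64)        # stand-in "real" features
fake_feats = np.random.randn(256, 64) + 0.5  # stand-in "synthetic" features
print(fid(real_feats, fake_feats))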
Recently, cost volume-based methods have achieved\ngreat success, benefiting from the rich geometry information in paired images.\nHowever, the redundancy of the cost volume also interferes with the model training\nand limits the performance. To construct a more precise cost volume, we\npioneeringly apply the diffusion model to stereo matching. Our method, termed\nDiffuVolume, considers the diffusion model as a cost volume filter, which will\nrecurrently remove the redundant information from the cost volume. Two main\ndesigns make our method not trivial. Firstly, to make the diffusion model more\nadaptive to stereo matching, we eschew the traditional manner of directly\nadding noise into the image but embed the diffusion model into a task-specific\nmodule. In this way, we outperform the traditional diffusion stereo matching\nmethod with a 22% EPE improvement and 240 times faster inference. Secondly,\nDiffuVolume can be easily embedded into any volume-based stereo matching\nnetwork, boosting performance with only a slight rise in parameters (about 2%). By adding\nthe DiffuVolume into well-performed methods, we outperform all published\nmethods on the Scene Flow, KITTI2012 and KITTI2015 benchmarks as well as in the zero-shot\ngeneralization setting. It is worth mentioning that the proposed model has ranked\n1st on the KITTI 2012 leaderboard and 2nd on the KITTI 2015 leaderboard since 15 July\n2023.\n","authors":["Dian Zheng","Xiao-Ming Wu","Zuhao Liu","Jingke Meng","Wei-shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.15989v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.15984v1","updated":"2023-08-30T12:13:13Z","published":"2023-08-30T12:13:13Z","title":"Learning Structure-from-Motion with Graph Attention Networks","summary":" In this paper we tackle the problem of learning Structure-from-Motion (SfM)\nthrough the use of graph attention networks. SfM is a classic computer vision\nproblem that is solved through iterative minimization of reprojection errors,\nreferred to as Bundle Adjustment (BA), starting from a good initialization. In\norder to obtain a good enough initialization for BA, conventional methods rely\non a sequence of sub-problems (such as pairwise pose estimation, pose averaging\nor triangulation) which provide an initial solution that can then be refined\nusing BA. In this work we replace these sub-problems by learning a model that\ntakes as input the 2D keypoints detected across multiple views, and outputs the\ncorresponding camera poses and 3D keypoint coordinates. Our model takes\nadvantage of graph neural networks to learn SfM-specific primitives, and we\nshow that it can be used for fast inference of the reconstruction for new and\nunseen sequences. The experimental results show that the proposed model\noutperforms competing learning-based methods, and challenges COLMAP while\nhaving lower runtime.\n","authors":["Lucas Brynte","José Pedro Iglesias","Carl Olsson","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2308.15984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15975v1","updated":"2023-08-30T11:57:04Z","published":"2023-08-30T11:57:04Z","title":"RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation","summary":" For robots to be useful outside labs and specialized factories we need a way\nto teach them new useful behaviors quickly.
Current approaches lack either the\ngenerality to onboard new tasks without task-specific engineering, or else lack\nthe data-efficiency to do so in an amount of time that enables practical use.\nIn this work we explore dense tracking as a representational vehicle to allow\nfaster and more general learning from demonstration. Our approach utilizes\nTrack-Any-Point (TAP) models to isolate the relevant motion in a demonstration,\nand parameterize a low-level controller to reproduce this motion across changes\nin the scene configuration. We show this results in robust robot policies that\ncan solve complex object-arrangement tasks such as shape-matching, stacking,\nand even full path-following tasks such as applying glue and sticking objects\ntogether, all from demonstrations that can be collected in minutes.\n","authors":["Mel Vecerik","Carl Doersch","Yi Yang","Todor Davchev","Yusuf Aytar","Guangyao Zhou","Raia Hadsell","Lourdes Agapito","Jon Scholz"],"pdf_url":"https://arxiv.org/pdf/2308.15975v1.pdf","comment":"Project website: https://robotap.github.io"},{"id":"http://arxiv.org/abs/2308.02562v2","updated":"2023-08-30T11:47:05Z","published":"2023-08-03T04:03:46Z","title":"Food Classification using Joint Representation of Visual and Textual\n Data","summary":" Food classification is an important task in health care. In this work, we\npropose a multimodal classification framework that uses the modified version of\nEfficientNet with the Mish activation function for image classification, and\nthe traditional BERT transformer-based network is used for text classification.\nThe proposed network and the other state-of-the-art methods are evaluated on a\nlarge open-source dataset, UPMC Food-101. The experimental results show that\nthe proposed network outperforms the other methods, a significant difference of\n11.57% and 6.34% in accuracy is observed for image and text classification,\nrespectively, when compared with the second-best performing method. We also\ncompared the performance in terms of accuracy, precision, and recall for text\nclassification using both machine learning and deep learning-based models. The\ncomparative analysis from the prediction results of both images and text\ndemonstrated the efficiency and robustness of the proposed approach.\n","authors":["Prateek Mittal","Puneet Goyal","Joohi Chauhan"],"pdf_url":"https://arxiv.org/pdf/2308.02562v2.pdf","comment":"Updated results and discussions to be posted and some sections needed\n to be expanded"},{"id":"http://arxiv.org/abs/2308.15966v1","updated":"2023-08-30T11:42:54Z","published":"2023-08-30T11:42:54Z","title":"SHARP Challenge 2023: Solving CAD History and pArameters Recovery from\n Point clouds and 3D scans. Overview, Datasets, Metrics, and Baselines","summary":" Recent breakthroughs in geometric Deep Learning (DL) and the availability of\nlarge Computer-Aided Design (CAD) datasets have advanced the research on\nlearning CAD modeling processes and relating them to real objects. In this\ncontext, 3D reverse engineering of CAD models from 3D scans is considered to be\none of the most sought-after goals for the CAD industry. However, recent\nefforts assume multiple simplifications limiting the applications in real-world\nsettings. The SHARP Challenge 2023 aims at pushing the research a step closer\nto the real-world scenario of CAD reverse engineering through dedicated\ndatasets and tracks. 
In this paper, we define the proposed SHARP 2023 tracks,\ndescribe the provided datasets, and propose a set of baseline methods along\nwith suitable evaluation metrics to assess the performance of the track\nsolutions. All proposed datasets along with useful routines and the evaluation\nmetrics are publicly available.\n","authors":["Dimitrios Mallis","Sk Aziz Ali","Elona Dupont","Kseniya Cherenkova","Ahmet Serdar Karadeniz","Mohammad Sadil Khan","Anis Kacem","Gleb Gusev","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2308.15966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15961v1","updated":"2023-08-30T11:35:21Z","published":"2023-08-30T11:35:21Z","title":"Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting","summary":" The task of radiology reporting comprises describing and interpreting the\nmedical findings in radiographic images, including description of their\nlocation and appearance. Automated approaches to radiology reporting require\nthe image to be encoded into a suitable token representation for input to the\nlanguage model. Previous methods commonly use convolutional neural networks to\nencode an image into a series of image-level feature map representations.\nHowever, the generated reports often exhibit realistic style but imperfect\naccuracy. Inspired by recent works for image captioning in the general domain\nin which each visual token corresponds to an object detected in an image, we\ninvestigate whether using local tokens corresponding to anatomical structures\ncan improve the quality of the generated reports. We introduce a novel\nadaptation of Faster R-CNN in which finding detection is performed for the\ncandidate bounding boxes extracted during anatomical structure localisation. We\nuse the resulting bounding box feature representations as our set of\nfinding-aware anatomical tokens. This encourages the extracted anatomical\ntokens to be informative about the findings they contain (required for the\nfinal task of radiology reporting). Evaluating on the MIMIC-CXR dataset of\nchest X-Ray images, we show that task-aware anatomical tokens give\nstate-of-the-art performance when integrated into an automated reporting\npipeline, yielding generated reports with improved clinical accuracy.\n","authors":["Francesco Dalla Serra","Chaoyang Wang","Fani Deligianni","Jeffrey Dalton","Alison Q. O'Neil"],"pdf_url":"https://arxiv.org/pdf/2308.15961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15960v1","updated":"2023-08-30T11:33:07Z","published":"2023-08-30T11:33:07Z","title":"Fusing Pseudo Labels with Weak Supervision for Dynamic Traffic Scenarios","summary":" Advanced Driver Assistance Systems (ADAS) have made significant strides,\ncapitalizing on computer vision to enhance perception and decision-making\ncapabilities. Nonetheless, the adaptation of these systems to diverse traffic\nscenarios poses challenges due to shifts in data distribution stemming from\nfactors such as location, weather, and road infrastructure. To tackle this, we\nintroduce a weakly-supervised label unification pipeline that amalgamates\npseudo labels from a multitude of object detection models trained on\nheterogeneous datasets. Our pipeline engenders a unified label space through\nthe amalgamation of labels from disparate datasets, rectifying bias and\nenhancing generalization. We fine-tune multiple object detection models on\nindividual datasets, subsequently crafting a unified dataset featuring pseudo\nlabels, meticulously validated for precision. 
Following this, we retrain a\nsolitary object detection model using the merged label space, culminating in a\nresilient model proficient in dynamic traffic scenarios. We put forth a\ncomprehensive evaluation of our approach, employing diverse datasets\noriginating from varied Asian countries, effectively demonstrating its efficacy\nin challenging road conditions. Notably, our method yields substantial\nenhancements in object detection performance, culminating in a model with\nheightened resistance against domain shifts.\n","authors":["Harshith Mohan Kumar","Sean Lawrence"],"pdf_url":"https://arxiv.org/pdf/2308.15960v1.pdf","comment":"This work was accepted as an extended abstract at the International\n Conference on Computer Vision (ICCV) 2023 BRAVO Workshop, Paris, France"},{"id":"http://arxiv.org/abs/2209.15376v3","updated":"2023-08-30T11:04:14Z","published":"2022-09-30T11:09:54Z","title":"NBV-SC: Next Best View Planning based on Shape Completion for Fruit\n Mapping and Reconstruction","summary":" Active perception for fruit mapping and harvesting is a difficult task since\nocclusions occur frequently and the location as well as size of fruits change\nover time. State-of-the-art viewpoint planning approaches utilize\ncomputationally expensive ray casting operations to find good viewpoints aiming\nat maximizing information gain and covering the fruits in the scene. In this\npaper, we present a novel viewpoint planning approach that explicitly uses\ninformation about the predicted fruit shapes to compute targeted viewpoints\nthat observe as yet unobserved parts of the fruits. Furthermore, we formulate\nthe concept of viewpoint dissimilarity to reduce the sampling space for more\nefficient selection of useful, dissimilar viewpoints. Our simulation\nexperiments with a UR5e arm equipped with an RGB-D sensor provide a\nquantitative demonstration of the efficacy of our iterative next best view\nplanning method based on shape completion. In comparative experiments with a\nstate-of-the-art viewpoint planner, we demonstrate improvement not only in the\nestimation of the fruit sizes, but also in their reconstruction, while\nsignificantly reducing the planning time. Finally, we show the viability of our\napproach for mapping sweet peppers plants with a real robotic system in a\ncommercial glasshouse.\n","authors":["Rohit Menon","Tobias Zaenker","Nils Dengler","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2209.15376v3.pdf","comment":"Agricultural Automation, Viewpoint Planning, Active Perception, Shape\n Completion"},{"id":"http://arxiv.org/abs/2308.15949v1","updated":"2023-08-30T10:57:41Z","published":"2023-08-30T10:57:41Z","title":"Latency-aware Unified Dynamic Networks for Efficient Image Recognition","summary":" Dynamic computation has emerged as a promising avenue to enhance the\ninference efficiency of deep networks. It allows selective activation of\ncomputational units, leading to a reduction in unnecessary computations for\neach input sample. However, the actual efficiency of these dynamic models can\ndeviate from theoretical predictions. This mismatch arises from: 1) the lack of\na unified approach due to fragmented research; 2) the focus on algorithm design\nover critical scheduling strategies, especially in CUDA-enabled GPU contexts;\nand 3) challenges in measuring practical latency, given that most libraries\ncater to static operations. 
Addressing these issues, we unveil the\nLatency-Aware Unified Dynamic Networks (LAUDNet), a framework that integrates\nthree primary dynamic paradigms: spatially adaptive computation, dynamic layer\nskipping, and dynamic channel skipping. To bridge the theoretical and practical\nefficiency gap, LAUDNet merges algorithmic design with scheduling optimization,\nguided by a latency predictor that accurately gauges dynamic operator latency.\nWe've tested LAUDNet across multiple vision tasks, demonstrating its capacity\nto notably reduce the latency of models like ResNet-101 by over 50% on\nplatforms such as V100, RTX3090, and TX2 GPUs. Notably, LAUDNet stands out in\nbalancing accuracy and efficiency. Code is available at:\nhttps://www.github.com/LeapLabTHU/LAUDNet.\n","authors":["Yizeng Han","Zeyu Liu","Zhihang Yuan","Yifan Pu","Chaofei Wang","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15942v1","updated":"2023-08-30T10:48:53Z","published":"2023-08-30T10:48:53Z","title":"Stage-by-stage Wavelet Optimization Refinement Diffusion Model for\n Sparse-View CT Reconstruction","summary":" Diffusion models have emerged as potential tools to tackle the challenge of\nsparse-view CT reconstruction, displaying superior performance compared to\nconventional methods. Nevertheless, these prevailing diffusion models\npredominantly focus on the sinogram or image domains, which can lead to\ninstability during model training, potentially culminating in convergence\ntowards local minimal solutions. The wavelet transform serves to disentangle\nimage contents and features into distinct frequency-component bands at varying\nscales, adeptly capturing diverse directional structures. Employing the wavelet\ntransform as a guiding sparsity prior significantly enhances the robustness of\ndiffusion models. In this study, we present an innovative approach named the\nStage-by-stage Wavelet Optimization Refinement Diffusion (SWORD) model for\nsparse-view CT reconstruction. Specifically, we establish a unified\nmathematical model integrating low-frequency and high-frequency generative\nmodels, achieving the solution with an optimization procedure. Furthermore, we\nperform the low-frequency and high-frequency generative models on the wavelet-\ndecomposed components rather than the sinogram or image domains, ensuring the\nstability of model training. Our method is rooted in established optimization\ntheory and comprises three distinct stages: low-frequency generation,\nhigh-frequency refinement and domain transform. Our experimental results\ndemonstrate that the proposed method outperforms existing state-of-the-art\nmethods both quantitatively and qualitatively.\n","authors":["Kai Xu","Shiyu Lu","Bin Huang","Weiwen Wu","Qiegen Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16098v5","updated":"2023-08-30T10:48:12Z","published":"2022-11-29T11:17:34Z","title":"Three-stage binarization of color document images based on discrete\n wavelet transform and generative adversarial networks","summary":" The efficient segmentation of foreground text information from the background\nin degraded color document images is a critical challenge in the preservation\nof ancient manuscripts. The imperfect preservation of ancient manuscripts over\ntime has led to various types of degradation, such as staining, yellowing, and\nink seepage, significantly affecting image binarization results.
This work\nproposes a three-stage method using Generative Adversarial Networks (GAN) for\nenhancing and binarizing degraded color document images through Discrete\nWavelet Transform (DWT). Stage-1 involves applying DWT and retaining the\nLow-Low (LL) subband images for image enhancement. In Stage-2, the original\ninput image is divided into four single-channel images (Red, Green, Blue, and\nGray), and each is trained with independent adversarial networks to extract\ncolor foreground information. In Stage-3, the output image from Stage-2 and the\noriginal input image are used to train independent adversarial networks for\ndocument binarization, enabling the integration of global and local features.\nThe experimental results demonstrate that our proposed method outperforms other\nclassic and state-of-the-art (SOTA) methods on the Document Image Binarization\nContest (DIBCO) datasets. We have released our implementation code at\nhttps://github.com/abcpp12383/ThreeStageBinarization.\n","authors":["Rui-Yang Ju","Yu-Shian Lin","Chih-Chia Chen","Chun-Tse Chien","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2211.16098v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15428v2","updated":"2023-08-30T10:38:41Z","published":"2023-07-28T09:26:00Z","title":"Implicit neural representation for change detection","summary":" Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained\nduring two distinct time periods over the same geographic region presents a\nsignificant challenge due to the disparities in spatial coverage and the\npresence of noise in the acquisition system. The most commonly used approaches\nto detecting changes in point clouds are based on supervised methods which\nnecessitate extensive labelled data often unavailable in real-world\napplications. To address these issues, we propose an unsupervised approach that\ncomprises two components: Implicit Neural Representation (INR) for continuous\nshape reconstruction and a Gaussian Mixture Model for categorising changes. INR\noffers a grid-agnostic representation for encoding bi-temporal point clouds,\nwith unmatched spatial support that can be regularised to enhance\nhigh-frequency details and reduce noise. The reconstructions at each timestamp\nare compared at arbitrary spatial scales, leading to a significant increase in\ndetection capabilities. We apply our method to a benchmark dataset comprising\nsimulated LiDAR point clouds for urban sprawling. This dataset encompasses\ndiverse challenging scenarios, varying in resolutions, input modalities and\nnoise levels. This enables a comprehensive multi-scenario evaluation, comparing\nour method with the current state-of-the-art approach. We outperform the\nprevious methods by a margin of 10% in the intersection over union metric. In\naddition, we put our techniques to practical use by applying them in a\nreal-world scenario to identify instances of illicit excavation of\narchaeological sites and validate our results by comparing them with findings\nfrom field experts.\n","authors":["Peter Naylor","Diego Di Carlo","Arianna Traviglia","Makoto Yamada","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.15428v2.pdf","comment":"Main article is 10 pages + 6 pages of supplementary. 
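Stage-1 of the three-stage binarization pipeline described above retains the Low-Low (LL) subband of a single-level 2D discrete wavelet transform as the enhanced input. A minimal sketch of that step using PyWavelets (the input array is a random stand-in, not the authors' data):

# Hedged sketch of a Stage-1-style DWT step: keep only the LL
# (approximation) subband. Uses PyWavelets.
import numpy as np
import pywt

def ll_subband(gray_image: np.ndarray) -> np.ndarray:
    # dwt2 returns (LL, (detail subbands)); only the approximation
    # band is kept here, halving the resolution.
    ll, details = pywt.dwt2(gray_image, "haar")
    return ll

img = np.random.rand(256, 256)        # stand-in for a document image
print(ll_subband(img).shape)          # (128, 128)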
Conference style\n paper"},{"id":"http://arxiv.org/abs/2308.15939v1","updated":"2023-08-30T10:35:36Z","published":"2023-08-30T10:35:36Z","title":"AnoVL: Adapting Vision-Language Models for Unified Zero-shot Anomaly\n Localization","summary":" Contrastive Language-Image Pre-training (CLIP) models have shown promising\nperformance on zero-shot visual recognition tasks by learning visual\nrepresentations under natural language supervision. Recent studies attempt the\nuse of CLIP to tackle zero-shot anomaly detection by matching images with\nnormal and abnormal state prompts. However, since CLIP focuses on building\ncorrespondence between paired text prompts and global image-level\nrepresentations, the lack of patch-level vision to text alignment limits its\ncapability on precise visual anomaly localization. In this work, we introduce a\ntraining-free adaptation (TFA) framework of CLIP for zero-shot anomaly\nlocalization. In the visual encoder, we innovate a training-free value-wise\nattention mechanism to extract intrinsic local tokens of CLIP for patch-level\nlocal description. From the perspective of text supervision, we particularly\ndesign a unified domain-aware contrastive state prompting template. On top of\nthe proposed TFA, we further introduce a test-time adaptation (TTA) mechanism\nto refine anomaly localization results, where a layer of trainable parameters\nin the adapter is optimized using TFA's pseudo-labels and synthetic\nnoise-corrupted tokens. With both TFA and TTA adaptation, we significantly\nexploit the potential of CLIP for zero-shot anomaly localization and\ndemonstrate the effectiveness of our proposed methods on various datasets.\n","authors":["Hanqiu Deng","Zhaoxiang Zhang","Jinan Bao","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2308.15939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15932v1","updated":"2023-08-30T10:21:57Z","published":"2023-08-30T10:21:57Z","title":"Attention-based CT Scan Interpolation for Lesion Segmentation of\n Colorectal Liver Metastases","summary":" Small liver lesions common to colorectal liver metastases (CRLMs) are\nchallenging for convolutional neural network (CNN) segmentation models,\nespecially when we have a wide range of slice thicknesses in the computed\ntomography (CT) scans. Slice thickness of CT images may vary by clinical\nindication. For example, thinner slices are used for presurgical planning when\nfine anatomic details of small vessels are required. While keeping the\neffective radiation dose in patients as low as possible, various slice\nthicknesses are employed in CRLMs due to their limitations. However,\ndifferences in slice thickness across CTs lead to significant performance\ndegradation in CT segmentation models based on CNNs. This paper proposes a\nnovel unsupervised attention-based interpolation model to generate intermediate\nslices from consecutive triplet slices in CT scans. We integrate segmentation\nloss during the interpolation model's training to leverage segmentation labels\nin existing slices to generate middle ones. Unlike common interpolation\ntechniques in CT volumes, our model highlights the regions of interest (liver\nand lesions) inside the abdominal CT scans in the interpolated slice. Moreover,\nour model's outputs are consistent with the original input slices while\nincreasing the segmentation performance in two cutting-edge 3D segmentation\npipelines. 
We tested the proposed model on the CRLM dataset to upsample\nsubjects with thick slices and create isotropic volume for our segmentation\nmodel. The produced isotropic dataset increases the Dice score in the\nsegmentation of lesions and outperforms other interpolation approaches in terms\nof interpolation metrics.\n","authors":["Mohammad Hamghalam","Richard K. G. Do","Amber L. Simpson"],"pdf_url":"https://arxiv.org/pdf/2308.15932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09829v2","updated":"2023-08-30T10:19:02Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little for\nclassification. This study empirically investigates the latter and expands the\nunderstanding of frequency shortcuts. First, we perform experiments on\nsynthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low- or high-frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. We recommend that future research should focus\non effective training schemes mitigating frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v2.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.15918v1","updated":"2023-08-30T09:45:14Z","published":"2023-08-30T09:45:14Z","title":"Physics-Informed DeepMRI: Bridging the Gap from Heat Diffusion to\n k-Space Interpolation","summary":" In the field of parallel imaging (PI), alongside image-domain regularization\nmethods, substantial research has been dedicated to exploring $k$-space\ninterpolation. However, the interpretability of these methods remains an\nunresolved issue. Furthermore, these approaches currently face acceleration\nlimitations that are comparable to those experienced by image-domain methods.\nIn order to enhance interpretability and overcome the acceleration limitations,\nthis paper introduces an interpretable framework that unifies both $k$-space\ninterpolation techniques and image-domain methods, grounded in the physical\nprinciples of heat diffusion equations. Building upon this foundational\nframework, a novel $k$-space interpolation method is proposed. Specifically, we\nmodel the process of high-frequency information attenuation in $k$-space as a\nheat diffusion equation, while the effort to reconstruct high-frequency\ninformation from low-frequency regions can be conceptualized as a reverse heat\nequation. However, solving the reverse heat equation poses a challenging\ninverse problem. 
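The forward model in the physics-informed DeepMRI abstract, high-frequency attenuation as heat diffusion, has a compact k-space form: evolving the heat equation for time t multiplies each k-space coefficient by exp(-t|k|^2). The conceptual sketch below (grid and time scale are assumptions, not from the paper) shows that forward damping; naively inverting it would divide by near-zero factors at high frequencies, which is why the paper resorts to a learned score-based prior.

# Toy illustration of heat diffusion as Gaussian damping in k-space.
import numpy as np

def heat_diffuse_kspace(kspace: np.ndarray, t: float) -> np.ndarray:
    ny, nx = kspace.shape
    ky = np.fft.fftfreq(ny)[:, None]
    kx = np.fft.fftfreq(nx)[None, :]
    decay = np.exp(-t * (kx ** 2 + ky ** 2))   # exp(-t |k|^2) attenuation
    return kspace * decay

img = np.random.rand(128, 128)
k = np.fft.fft2(img)
k_blurred = heat_diffuse_kspace(k, t=50.0)
# Undoing the blur by dividing by `decay` amplifies noise wherever
# decay ~ 0: the reverse heat equation is ill-posed without a prior.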
To tackle this challenge, we modify the heat equation to align\nwith the principles of magnetic resonance PI physics and employ the score-based\ngenerative method to precisely execute the modified reverse heat diffusion.\nFinally, experimental validation conducted on publicly available datasets\ndemonstrates the superiority of the proposed approach over traditional\n$k$-space interpolation methods, deep learning-based $k$-space interpolation\nmethods, and conventional diffusion models in terms of reconstruction accuracy,\nparticularly in high-frequency regions.\n","authors":["Zhuo-Xu Cui","Congcong Liu","Xiaohong Fan","Chentao Cao","Jing Cheng","Qingyong Zhu","Yuanyuan Liu","Sen Jia","Yihang Zhou","Haifeng Wang","Yanjie Zhu","Jianping Zhang","Qiegen Liu","Dong Liang"],"pdf_url":"https://arxiv.org/pdf/2308.15918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14027v2","updated":"2023-08-30T09:26:11Z","published":"2023-03-24T14:37:07Z","title":"Poincaré ResNet","summary":" This paper introduces an end-to-end residual network that operates entirely\non the Poincar\\'e ball model of hyperbolic space. Hyperbolic learning has\nrecently shown great potential for visual understanding, but is currently only\nperformed in the penultimate layer(s) of deep networks. All visual\nrepresentations are still learned through standard Euclidean networks. In this\npaper we investigate how to learn hyperbolic representations of visual data\ndirectly from the pixel-level. We propose Poincar\\'e ResNet, a hyperbolic\ncounterpart of the celebrated residual network, starting from Poincar\\'e 2D\nconvolutions up to Poincar\\'e residual connections. We identify three\nroadblocks for training convolutional networks entirely in hyperbolic space and\npropose a solution for each: (i) Current hyperbolic network initializations\ncollapse to the origin, limiting their applicability in deeper networks. We\nprovide an identity-based initialization that preserves norms over many layers.\n(ii) Residual networks rely heavily on batch normalization, which comes with\nexpensive Fr\\'echet mean calculations in hyperbolic space. We introduce\nPoincar\\'e midpoint batch normalization as a faster and equally effective\nalternative. (iii) Due to the many intermediate operations in Poincar\\'e\nlayers, we lastly find that the computation graphs of deep learning libraries\nblow up, limiting our ability to train on deep hyperbolic networks. We provide\nmanual backward derivations of core hyperbolic operations to maintain\nmanageable computation graphs.\n","authors":["Max van Spengler","Erwin Berkhout","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2303.14027v2.pdf","comment":"International Conference on Computer Vision 2023"},{"id":"http://arxiv.org/abs/2308.15887v1","updated":"2023-08-30T09:04:24Z","published":"2023-08-30T09:04:24Z","title":"On the Potential of CLIP for Compositional Logical Reasoning","summary":" In this paper we explore the possibility of using OpenAI's CLIP to perform\nlogically coherent grounded visual reasoning. 
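Networks such as the Poincaré ResNet above are assembled from a small set of hyperbolic primitives. The following is a generic sketch of two of them, Möbius addition and geodesic distance on the unit-curvature Poincaré ball; it is textbook hyperbolic geometry, not the paper's code.

# Hedged sketch of core Poincare-ball operations (curvature c = 1).
import torch

def mobius_add(x, y, eps=1e-5):
    xy = (x * y).sum(-1, keepdim=True)
    x2 = (x * x).sum(-1, keepdim=True)
    y2 = (y * y).sum(-1, keepdim=True)
    num = (1 + 2 * xy + y2) * x + (1 - x2) * y
    den = (1 + 2 * xy + x2 * y2).clamp_min(eps)
    return num / den

def poincare_dist(x, y):
    # d(x, y) = 2 artanh(|| -x (+) y ||) on the unit ball.
    diff = mobius_add(-x, y).norm(dim=-1).clamp(max=1 - 1e-5)
    return 2 * torch.atanh(diff)

x = torch.tensor([0.1, 0.2])
y = torch.tensor([-0.3, 0.4])
print(poincare_dist(x, y))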
To that end, we formalize our\nterms and give a geometric analysis of how embeddings in CLIP's latent space\nwould need to be configured in order for the system to be logically coherent.\nOur main conclusion is that, as usually configured, CLIP cannot perform such\nreasoning.\n","authors":["Justin Brody"],"pdf_url":"https://arxiv.org/pdf/2308.15887v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15881v1","updated":"2023-08-30T09:03:28Z","published":"2023-08-30T09:03:28Z","title":"Interpretability-guided Data Augmentation for Robust Segmentation in\n Multi-centre Colonoscopy Data","summary":" Multi-centre colonoscopy images from various medical centres exhibit distinct\ncomplicating factors and overlays that impact the image content, contingent on\nthe specific acquisition centre. Existing Deep Segmentation networks struggle\nto achieve adequate generalizability in such data sets, and the currently\navailable data augmentation methods do not effectively address these sources of\ndata variability. As a solution, we introduce an innovative data augmentation\napproach centred on interpretability saliency maps, aimed at enhancing the\ngeneralizability of Deep Learning models within the realm of multi-centre\ncolonoscopy image segmentation. The proposed augmentation technique\ndemonstrates increased robustness across different segmentation models and\ndomains. Thorough testing on a publicly available multi-centre dataset for\npolyp detection demonstrates the effectiveness and versatility of our approach,\nwhich is observed both in quantitative and qualitative results. The code is\npublicly available at:\nhttps://github.com/nki-radiology/interpretability_augmentation\n","authors":["Valentina Corbetta","Regina Beets-Tan","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2308.15881v1.pdf","comment":"10 pages, 4 figures, 1 table, accepted at MICCAI 2023 Workshop on\n Machine Learning in Medical Imaging (MLMI)"},{"id":"http://arxiv.org/abs/2308.15868v1","updated":"2023-08-30T08:56:36Z","published":"2023-08-30T08:56:36Z","title":"Feature Attention Network (FA-Net): A Deep-Learning Based Approach for\n Underwater Single Image Enhancement","summary":" Underwater image processing and analysis have been a hotspot of study in\nrecent years, as more emphasis has been focused on underwater monitoring and\nthe usage of marine resources. Compared with the open environment, underwater\nimages encounter more complicated conditions such as light absorption,\nscattering, turbulence, nonuniform illumination and color diffusion. Although\nconsiderable advances in enhancement techniques have been achieved in resolving\nthese issues, they treat low-frequency information equally across the entire\nchannel, which limits the network's representativeness. We propose a deep\nlearning and feature-attention-based end-to-end network (FA-Net) to solve this\nproblem. In particular, we propose a Residual Feature Attention Block (RFAB),\ncontaining the channel attention, pixel attention, and residual learning\nmechanism with long and short skip connections. RFAB allows the network to\nfocus on learning high-frequency information while skipping low-frequency\ninformation on multi-hop connections. The channel and pixel attention mechanism\nconsiders each channel's different features and the uneven distribution of haze\nover different pixels in the image.
The experimental results show that the\nproposed FA-Net provides higher accuracy, both quantitatively and qualitatively,\nand is superior to previous state-of-the-art methods.\n","authors":["Muhammad Hamza","Ammar Hawbani","Sami Ul Rehman","Xingfu Wang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.15868v1.pdf","comment":"Fourteenth International Conference on Digital Image Processing\n (ICDIP 2022), Wuhan, China, May 20-23, 2022. 8 pages, 5 figures. doi:\n 10.1117/12.2644516"},{"id":"http://arxiv.org/abs/2308.15855v1","updated":"2023-08-30T08:44:21Z","published":"2023-08-30T08:44:21Z","title":"Semi-supervised Domain Adaptation with Inter and Intra-domain Mixing for\n Semantic Segmentation","summary":" Despite recent advances in semantic segmentation, an inevitable challenge is\nthe performance degradation caused by the domain shift in real applications.\nThe current dominant approach to solving this problem is unsupervised domain\nadaptation (UDA). However, the absence of labeled target data in UDA is overly\nrestrictive and limits performance. To overcome this limitation, a more\npractical scenario called semi-supervised domain adaptation (SSDA) has been\nproposed. Existing SSDA methods are derived from the UDA paradigm and primarily\nfocus on leveraging the unlabeled target data and source data. In this paper,\nwe highlight the significance of exploiting the intra-domain information\nbetween the limited labeled target data and unlabeled target data, as it\ngreatly benefits domain adaptation. Instead of solely using the scarce labeled\ndata for supervision, we propose a novel SSDA framework that incorporates both\ninter-domain mixing and intra-domain mixing, where inter-domain mixing\nmitigates the source-target domain gap and intra-domain mixing enriches the\navailable target domain information. By simultaneously learning from\ninter-domain mixing and intra-domain mixing, the network can capture more\ndomain-invariant features and promote its performance on the target domain. We\nalso explore different domain mixing operations to better exploit the target\ndomain information. Comprehensive experiments conducted on the GTA5toCityscapes\nand SYNTHIA2Cityscapes benchmarks demonstrate the effectiveness of our method,\nsurpassing previous methods by a large margin.\n","authors":["Weifu Fu","Qiang Nie","Jialin Li","Yuhuan Lin","Kai Wu","Yong Liu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15855v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2211.12436v2","updated":"2023-08-30T08:40:16Z","published":"2022-11-22T17:45:06Z","title":"Dynamic Depth-Supervised NeRF for Multi-View RGB-D Operating Room Images","summary":" The operating room (OR) is an environment of interest for the development of\nsensing systems, enabling the detection of people, objects, and their semantic\nrelations. Due to frequent occlusions in the OR, these systems often rely on\ninput from multiple cameras. While increasing the number of cameras generally\nincreases algorithm performance, there are hard limitations to the number and\nlocations of cameras in the OR. Neural Radiance Fields (NeRF) can be used to\nrender synthetic views from arbitrary camera positions, virtually enlarging the\nnumber of cameras in the dataset.
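The channel-attention/pixel-attention pairing described in the FA-Net abstract follows a common pattern: squeeze per-channel statistics to reweight channels, then predict a single-channel spatial map to reweight pixels. A generic sketch of that pattern (layer sizes hypothetical, not the FA-Net implementation):

# Hedged sketch of a channel-attention / pixel-attention pair with a
# residual connection, in the spirit of an RFAB-style block.
import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    def __init__(self, ch, r=8):
        super().__init__()
        self.net = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),             # per-channel statistics
            nn.Conv2d(ch, ch // r, 1), nn.ReLU(),
            nn.Conv2d(ch // r, ch, 1), nn.Sigmoid())

    def forward(self, x):
        return x * self.net(x)                   # reweight channels

class PixelAttention(nn.Module):
    def __init__(self, ch, r=8):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(ch, ch // r, 1), nn.ReLU(),
            nn.Conv2d(ch // r, 1, 1), nn.Sigmoid())

    def forward(self, x):
        return x * self.net(x)                   # reweight spatial positions

x = torch.randn(2, 64, 32, 32)
y = PixelAttention(64)(ChannelAttention(64)(x)) + x   # short skip connection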
In this work, we explore the use of NeRF for\nview synthesis of dynamic scenes in the OR, and we show that regularisation\nwith depth supervision from RGB-D sensor data results in higher image quality.\nWe optimise a dynamic depth-supervised NeRF with up to six synchronised cameras\nthat capture the surgical field in five distinct phases before and during a\nknee replacement surgery. We qualitatively inspect views rendered by a virtual\ncamera that moves 180 degrees around the surgical field at differing time\nvalues. Quantitatively, we evaluate view synthesis from an unseen camera\nposition in terms of PSNR, SSIM and LPIPS for the colour channels and in MAE\nand error percentage for the estimated depth. We find that NeRFs can be used to\ngenerate geometrically consistent views, also from interpolated camera\npositions and at interpolated time intervals. Views are generated from an\nunseen camera pose with an average PSNR of 18.2 and a depth estimation error of\n2.0%. Our results show the potential of a dynamic NeRF for view synthesis in\nthe OR and stress the relevance of depth supervision in a clinical setting.\n","authors":["Beerend G. A. Gerats","Jelmer M. Wolterink","Ivo A. M. J. Broeders"],"pdf_url":"https://arxiv.org/pdf/2211.12436v2.pdf","comment":"Accepted to the Workshop on Ambient Intelligence for HealthCare 2023"},{"id":"http://arxiv.org/abs/2308.15854v1","updated":"2023-08-30T08:40:15Z","published":"2023-08-30T08:40:15Z","title":"Zero-shot Inversion Process for Image Attribute Editing with Diffusion\n Models","summary":" Denoising diffusion models have shown outstanding performance in image\nediting. Existing works tend to use either image-guided methods, which provide\na visual reference but lack control over semantic coherence, or text-guided\nmethods, which ensure faithfulness to text guidance but lack visual quality. To\naddress the problem, we propose the Zero-shot Inversion Process (ZIP), a\nframework that injects a fusion of generated visual reference and text guidance\ninto the semantic latent space of a \\textit{frozen} pre-trained diffusion\nmodel. Only using a tiny neural network, the proposed ZIP produces diverse\ncontent and attributes under the intuitive control of the text prompt.\nMoreover, ZIP shows remarkable robustness for both in-domain and out-of-domain\nattribute manipulation on real images. We perform detailed experiments on\nvarious benchmark datasets. Compared to state-of-the-art methods, ZIP produces\nimages of equivalent quality while providing a realistic editing effect.\n","authors":["Zhanbo Feng","Zenan Ling","Ci Gong","Feng Zhou","Jie Li","Robert C. Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.15854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14847v2","updated":"2023-08-30T08:34:08Z","published":"2023-08-28T19:08:17Z","title":"NSF: Neural Surface Fields for Human Modeling from Monocular Depth","summary":" Obtaining personalized 3D animatable avatars from a monocular camera has\nseveral real world applications in gaming, virtual try-on, animation, and\nVR/XR, etc. However, it is very challenging to model dynamic and fine-grained\nclothing deformations from such sparse data. Existing methods for modeling 3D\nhumans from depth data have limitations in terms of computational efficiency,\nmesh coherency, and flexibility in resolution and topology. For instance,\nreconstructing shapes using implicit functions and extracting explicit meshes\nper frame is computationally expensive and cannot ensure coherent meshes across\nframes. 
Moreover, predicting per-vertex deformations on a pre-designed human\ntemplate with a discrete surface lacks flexibility in resolution and topology.\nTo overcome these limitations, we propose a novel method `NSF: Neural\nSurface Fields' for modeling 3D clothed humans from monocular depth. NSF\ndefines a neural field solely on the base surface, which models a continuous and\nflexible displacement field. NSF can be adapted to the base surface with\ndifferent resolution and topology without retraining at inference time.\nCompared to existing approaches, our method eliminates the expensive per-frame\nsurface extraction while maintaining mesh coherency, and is capable of\nreconstructing meshes with arbitrary resolution without retraining. To foster\nresearch in this direction, we release our code on the project page at:\nhttps://yuxuan-xue.com/nsf.\n","authors":["Yuxuan Xue","Bharat Lal Bhatnagar","Riccardo Marin","Nikolaos Sarafianos","Yuanlu Xu","Gerard Pons-Moll","Tony Tung"],"pdf_url":"https://arxiv.org/pdf/2308.14847v2.pdf","comment":"Accepted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf"},{"id":"http://arxiv.org/abs/2308.15846v1","updated":"2023-08-30T08:33:13Z","published":"2023-08-30T08:33:13Z","title":"Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object\n Detection","summary":" In this paper, we explore, for the first time, helpful multi-modal contextual\nknowledge to understand novel categories for open-vocabulary object detection\n(OVD). The multi-modal contextual knowledge stands for the joint relationship\nacross regions and words. However, it is challenging to incorporate such\nmulti-modal contextual knowledge into OVD. The reason is that previous\ndetection frameworks fail to jointly model multi-modal contextual knowledge, as\nobject detectors only support vision inputs and no caption description is\nprovided at test time. To this end, we propose a multi-modal contextual\nknowledge distillation framework, MMC-Det, to transfer the learned contextual\nknowledge from a teacher fusion transformer with diverse multi-modal masked\nlanguage modeling (D-MLM) to a student detector. The diverse multi-modal masked\nlanguage modeling is realized by an object divergence constraint upon\ntraditional multi-modal masked language modeling (MLM), in order to extract\nfine-grained region-level visual contexts, which are vital to object detection.\nExtensive experiments performed upon various detection datasets show the\neffectiveness of our multi-modal context learning strategy, where our approach\noutperforms the recent state-of-the-art methods.\n","authors":["Yifan Xu","Mengdan Zhang","Xiaoshan Yang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2308.15846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15844v1","updated":"2023-08-30T08:31:55Z","published":"2023-08-30T08:31:55Z","title":"Reconstructing Groups of People with Hypergraph Relational Reasoning","summary":" Due to the mutual occlusion, severe scale variation, and complex spatial\ndistribution, the current multi-person mesh recovery methods cannot produce\naccurate absolute body poses and shapes in large-scale crowded scenes. To\naddress these obstacles, we fully exploit crowd features for reconstructing\ngroups of people from a monocular image. A novel hypergraph relational\nreasoning network is proposed to formulate the complex and high-order relation\ncorrelations among individuals and groups in the crowd. 
We first extract\ncompact human features and location information from the original\nhigh-resolution image. By conducting the relational reasoning on the extracted\nindividual features, the underlying crowd collectiveness and interaction\nrelationship can provide additional group information for the reconstruction.\nFinally, the updated individual features and the localization information are\nused to regress human meshes in camera coordinates. To facilitate the network\ntraining, we further build pseudo ground-truth on two crowd datasets, which may\nalso promote future research on pose estimation and human behavior\nunderstanding in crowded scenes. The experimental results show that our\napproach outperforms other baseline methods both in crowded and common\nscenarios. The code and datasets are publicly available at\nhttps://github.com/boycehbz/GroupRec.\n","authors":["Buzhen Huang","Jingyi Ju","Zhihao Li","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15844v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.15839v1","updated":"2023-08-30T08:21:52Z","published":"2023-08-30T08:21:52Z","title":"Utilizing Task-Generic Motion Prior to Recover Full-Body Motion from\n Very Sparse Signals","summary":" The most popular type of devices used to track a user's posture in a virtual\nreality experience consists of a head-mounted display and two controllers held\nin both hands. However, due to the limited number of tracking sensors (three in\ntotal), faithfully recovering the user's full body is challenging, limiting\nthe potential for interactions among simulated user avatars within the virtual\nworld. Therefore, recent studies have attempted to reconstruct full-body poses\nusing neural networks that utilize previously learned human poses or accept a\nseries of past poses over a short period. In this paper, we propose a method\nthat utilizes information from a neural motion prior to improve the accuracy of\nthe reconstructed user's motions. Our approach aims to reconstruct the user's full-body\nposes by predicting the latent representation of the user's overall motion from\nlimited input signals and integrating this information with tracking sensor\ninputs. This is based on the premise that the ultimate goal of pose\nreconstruction is to reconstruct the motion, which is a series of poses. Our\nresults show that this integration enables more accurate reconstruction of the\nuser's full-body motion, particularly enhancing the robustness of lower body\nmotion reconstruction from impoverished signals. Web:\nhttps://mjsh34.github.io/mp-sspe/\n","authors":["Myungjin Shin","Dohae Lee","In-Kwon Lee"],"pdf_url":"https://arxiv.org/pdf/2308.15839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06262v4","updated":"2023-08-30T08:21:13Z","published":"2023-01-16T05:08:50Z","title":"Collaborative Perception in Autonomous Driving: Methods, Datasets and\n Challenges","summary":" Collaborative perception is essential to address occlusion and sensor failure\nissues in autonomous driving. In recent years, theoretical and experimental\ninvestigations of novel works for collaborative perception have increased\ntremendously. So far, however, few reviews have focused on systematic\ncollaboration modules and large-scale collaborative perception datasets. This\nwork reviews recent achievements in this field to bridge this gap and motivate\nfuture research. We start with a brief overview of collaboration schemes. 
After\nthat, we systematically summarize the collaborative perception methods for\nideal scenarios and real-world issues. The former focuses on collaboration\nmodules and efficiency, and the latter is devoted to addressing the problems in\nactual application. Furthermore, we present large-scale public datasets and\nsummarize quantitative results on these benchmarks. Finally, we highlight gaps\nand overlooked challenges between current academic research and real-world\napplications. The project page is\nhttps://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving\n","authors":["Yushan Han","Hui Zhang","Huifang Li","Yi Jin","Congyan Lang","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2301.06262v4.pdf","comment":"18 pages, 6 figures. Accepted by IEEE Intelligent Transportation\n Systems Magazine. URL:\n https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving"},{"id":"http://arxiv.org/abs/2308.15321v2","updated":"2023-08-30T08:20:30Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then attribute the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v2.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2204.09398v2","updated":"2023-08-30T08:18:15Z","published":"2022-04-20T11:43:58Z","title":"Case-Aware Adversarial Training","summary":" The neural network (NN) has become one of the most popular types of models in\nvarious signal processing applications. However, NNs are extremely vulnerable\nto adversarial examples (AEs). To defend against AEs, adversarial training (AT) is\nbelieved to be the most effective method, but due to its intensive\ncomputation, AT is rarely applied in practice. In this paper,\nto resolve the problem, we design a generic and efficient AT improvement\nscheme, namely case-aware adversarial training (CAT). Specifically, the\nintuition stems from the fact that a very limited portion of informative samples\ncontributes to most of the model performance. Alternatively, if only the most\ninformative AEs are used in AT, we can lower the computation complexity of AT\nsignificantly while maintaining the defense effect. To achieve this, CAT achieves\ntwo breakthroughs. 
First, a method to estimate the information degree of\nadversarial examples is proposed for AE filtering. Second, to further enrich\nthe information that the NN can obtain from AEs, CAT involves a weight\nestimation and class-level balancing based sampling strategy to increase the\ndiversity of AT at each iteration. Extensive experiments show that CAT is\nfaster than vanilla AT by up to 3x while achieving a competitive defense effect.\n","authors":["Mingyuan Fan","Yang Liu","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2204.09398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03512v3","updated":"2023-08-30T08:10:20Z","published":"2023-07-07T11:00:44Z","title":"Transfer Learning of Semantic Segmentation Methods for Identifying Buried\n Archaeological Structures on LiDAR Data","summary":" When applying deep learning to remote sensing data in archaeological\nresearch, a notable obstacle is the limited availability of suitable datasets\nfor training models. The application of transfer learning is frequently\nemployed to mitigate this drawback. However, there is still a need to explore\nits effectiveness when applied across different archaeological datasets. This\npaper compares the performance of various transfer learning configurations\nusing two semantic segmentation deep neural networks on two LiDAR datasets. The\nexperimental results indicate that transfer learning-based approaches in\narchaeology can lead to performance improvements, although a systematic\nenhancement has not yet been observed. We provide specific insights about the\nvalidity of such techniques that can serve as a baseline for future works.\n","authors":["Gregory Sech","Paolo Soleni","Wouter B. Verschoof-van der Vaart","Žiga Kokalj","Arianna Traviglia","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.03512v3.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2023 (IGARSS 2023) @IEEE copyright"},{"id":"http://arxiv.org/abs/2308.15829v1","updated":"2023-08-30T08:09:40Z","published":"2023-08-30T08:09:40Z","title":"Early Detection of Red Palm Weevil Infestations using Deep Learning\n Classification of Acoustic Signals","summary":" The Red Palm Weevil (RPW), also known as the palm weevil, is considered among\nthe world's most damaging insect pests of palms. Current detection techniques\ninclude the detection of symptoms of RPW using visual or sound inspection and\nchemical detection of volatile signatures generated by infested palm trees.\nHowever, efficient detection of RPW infestations at an early stage is considered\none of the most challenging issues for cultivating date palms. In this paper,\nan efficient approach to the early detection of RPW is proposed. The proposed\napproach is based on RPW sound activities being recorded and analyzed. The\nfirst step involves the conversion of sound data into images based on a\nselected set of features. The second step involves the combination of images\nfrom the same sound file but computed by different features into a single\nimage. The third step involves the application of different Deep Learning (DL)\ntechniques to classify resulting images into two classes: infested and not\ninfested. Experimental results show the good performance of the proposed approach\nfor RPW detection using different DL techniques, namely MobileNetV2,\nResNet50V2, ResNet152V2, VGG16, VGG19, DenseNet121, DenseNet201, Xception, and\nInceptionV3. 
The proposed approach outperformed existing techniques on public\ndatasets.\n","authors":["Wadii Boulila","Ayyub Alzahem","Anis Koubaa","Bilel Benjdira","Adel Ammar"],"pdf_url":"https://arxiv.org/pdf/2308.15829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15827v1","updated":"2023-08-30T08:03:49Z","published":"2023-08-30T08:03:49Z","title":"Introducing Language Guidance in Prompt-based Continual Learning","summary":" Continual Learning aims to learn a single model on a sequence of tasks\nwithout having access to data from previous tasks. The biggest challenge in the\ndomain still remains catastrophic forgetting: a loss in performance on seen\nclasses of earlier tasks. Some existing methods rely on an expensive replay\nbuffer to store a chunk of data from previous tasks. This, while promising,\nbecomes expensive when the number of tasks becomes large or data cannot be\nstored for privacy reasons. As an alternative, prompt-based methods have been\nproposed that store the task information in a learnable prompt pool. This\nprompt pool instructs a frozen image encoder on how to solve each task. While\nthe model faces a disjoint set of classes in each task in this setting, we\nargue that these classes can be encoded to the same embedding space of a\npre-trained language encoder. In this work, we propose Language Guidance for\nPrompt-based Continual Learning (LGCL) as a plug-in for prompt-based methods.\nLGCL is model agnostic and introduces language guidance at the task level in\nthe prompt pool and at the class level on the output feature of the vision\nencoder. We show with extensive experimentation that LGCL consistently improves\nthe performance of prompt-based continual learning methods to set a new\nstate-of-the-art. LGCL achieves these performance improvements without needing\nany additional learnable parameters.\n","authors":["Muhammad Gul Zain Ali Khan","Muhammad Ferjad Naeem","Luc Van Gool","Didier Stricker","Federico Tombari","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2308.15827v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2209.14624v2","updated":"2023-08-30T08:01:22Z","published":"2022-09-29T08:38:30Z","title":"Is Complexity Required for Neural Network Pruning? A Case Study on\n Global Magnitude Pruning","summary":" Pruning neural networks has become popular in the last decade when it was\nshown that a large number of weights can be safely removed from modern neural\nnetworks without compromising accuracy. Numerous pruning methods have been\nproposed since then, each claiming to be better than the previous. Many\nstate-of-the-art (SOTA) techniques today rely on complex pruning methodologies\nutilizing importance scores, getting feedback through back-propagation or\nhaving heuristics-based pruning rules amongst others. In this work, we question\nwhether this pattern of introducing complexity is really necessary to achieve\nbetter pruning results. We benchmark these SOTA techniques against a naive\npruning baseline, namely, Global Magnitude Pruning (Global MP). Global MP ranks\nweights in order of their magnitudes and prunes the smallest ones. Hence, in\nits vanilla form, it is one of the simplest pruning techniques. Surprisingly,\nwe find that vanilla Global MP outperforms all the other SOTA techniques and\nachieves a new SOTA result. It also achieves promising performance on FLOPs\nsparsification, which we find is enhanced when pruning is conducted in a\ngradual fashion. 
We also find that Global MP is generalizable across tasks,\ndatasets, and models with superior performance. Moreover, a common issue that\nmany pruning algorithms run into at high sparsity rates, namely,\nlayer-collapse, can be easily fixed in Global MP by setting a minimum threshold\nof weights to be retained in each layer. Lastly, unlike many other SOTA\ntechniques, Global MP does not require any additional algorithm specific\nhyper-parameters and is very straightforward to tune and implement. We showcase\nour findings on various models (WRN-28-8, ResNet-32, ResNet-50, MobileNet-V1\nand FastGRNN) and multiple datasets (CIFAR-10, ImageNet and HAR-2). Code is\navailable at https://github.com/manasgupta-1/GlobalMP.\n","authors":["Manas Gupta","Efe Camci","Vishandi Rudy Keneta","Abhishek Vaidyanathan","Ritwik Kanodia","Chuan-Sheng Foo","Wu Min","Lin Jie"],"pdf_url":"https://arxiv.org/pdf/2209.14624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15822v1","updated":"2023-08-30T07:48:32Z","published":"2023-08-30T07:48:32Z","title":"AMDNet23: A combined deep Contour-based Convolutional Neural Network and\n Long Short Term Memory system to diagnose Age-related Macular Degeneration","summary":" In light of the expanding population, an automated framework of disease\ndetection can assist doctors in the diagnosis of ocular diseases, yield\naccurate, stable, and rapid outcomes, and improve the success rate of early\ndetection. The work initially enhances the quality of fundus\nimages by employing an adaptive contrast enhancement algorithm (CLAHE) and\ngamma correction. In the preprocessing techniques, CLAHE elevates the local\ncontrast of the fundus image and gamma correction increases the intensity of\nrelevant features. This study operates on an AMDNet23 deep learning system\nthat combines a convolutional neural network (CNN) and long short-term\nmemory (LSTM) to automatically detect age-related macular degeneration\n(AMD) disease from fundus ophthalmology images. In this mechanism, the CNN is utilized for\nextracting features and the LSTM is utilized to classify the extracted features. The\ndataset of this research was collected from multiple sources and afterward\nrefined with quality assessment techniques; the 2000 experimental fundus images\nencompass four distinct classes equitably. The proposed hybrid deep AMDNet23\nmodel demonstrates the detection of AMD ocular disease, and the experimental\nresults achieved an accuracy of 96.50%, specificity of 99.32%, sensitivity of 96.5%, and\nF1-score of 96.49%. The system achieves state-of-the-art results on fundus\nimagery datasets for diagnosing AMD ocular disease, and the findings effectively\ndemonstrate the potential of our method.\n","authors":["Md. Aiyub Ali","Md. Shakhawat Hossain","Md. Kawar Hossain","Subhadra Soumi Sikder","Sharun Akter Khushbu","Mirajul Islam"],"pdf_url":"https://arxiv.org/pdf/2308.15822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15816v1","updated":"2023-08-30T07:41:26Z","published":"2023-08-30T07:41:26Z","title":"Improving Underwater Visual Tracking With a Large Scale Dataset and\n Image Enhancement","summary":" This paper presents a new dataset and general tracker enhancement method for\nUnderwater Visual Object Tracking (UVOT). Despite its significance, underwater\ntracking has remained unexplored due to data inaccessibility. It poses distinct\nchallenges; the underwater environment exhibits non-uniform lighting\nconditions, low visibility, lack of sharpness, low contrast, camouflage, and\nreflections from suspended particles. 
Performance of traditional tracking\nmethods designed primarily for terrestrial or open-air scenarios drops in such\nconditions. We address the problem by proposing a novel underwater image\nenhancement algorithm designed specifically to boost tracking quality. The\nmethod has resulted in a significant performance improvement, of up to 5.0%\nAUC, of state-of-the-art (SOTA) visual trackers. To develop robust and accurate\nUVOT methods, large-scale datasets are required. To this end, we introduce a\nlarge-scale UVOT benchmark dataset consisting of 400 video segments and 275,000\nmanually annotated frames enabling underwater training and evaluation of deep\ntrackers. The videos are labelled with several underwater-specific tracking\nattributes including watercolor variation, target distractors, camouflage,\ntarget relative size, and low visibility conditions. The UVOT400 dataset,\ntracking results, and the code are publicly available on:\nhttps://github.com/BasitAlawode/UWVOT400.\n","authors":["Basit Alawode","Fayaz Ali Dharejo","Mehnaz Ummar","Yuhang Guo","Arif Mahmood","Naoufel Werghi","Fahad Shahbaz Khan","Sajid Javed"],"pdf_url":"https://arxiv.org/pdf/2308.15816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15807v1","updated":"2023-08-30T07:23:32Z","published":"2023-08-30T07:23:32Z","title":"ACNPU: A 4.75TOPS/W 1080P@30FPS Super Resolution Accelerator with\n Decoupled Asymmetric Convolution","summary":" Deep learning-driven superresolution (SR) outperforms traditional techniques\nbut also faces the challenge of high complexity and memory bandwidth. This\nchallenge leads many accelerators to opt for simpler and shallow models like\nFSRCNN, compromising performance for real-time needs, especially for\nresource-limited edge devices. This paper proposes an energy-efficient SR\naccelerator, ACNPU, to tackle this challenge. The ACNPU enhances image quality\nby 0.34dB with a 27-layer model, but needs 36\\% less complexity than FSRCNN,\nwhile maintaining a similar model size, with the \\textit{decoupled asymmetric\nconvolution and split-bypass structure}. The hardware-friendly 17K-parameter\nmodel enables \\textit{holistic model fusion} instead of localized layer fusion\nto remove external DRAM access of intermediate feature maps. The on-chip memory\nbandwidth is further reduced with the \\textit{input stationary flow} and\n\\textit{parallel-layer execution} to reduce power consumption. Hardware is\nregular and easy to control to support different layers by \\textit{processing\nelements (PEs) clusters with reconfigurable input and uniform data flow}. The\nimplementation in the 40 nm CMOS process consumes 2333 K gate counts and 198KB\nSRAMs. The ACNPU achieves 31.7 FPS and 124.4 FPS for x2 and x4 scales Full-HD\ngeneration, respectively, which attains 4.75 TOPS/W energy efficiency.\n","authors":["Tun-Hao Yang","Tian-Sheuan Chang"],"pdf_url":"https://arxiv.org/pdf/2308.15807v1.pdf","comment":"9 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.07016v2","updated":"2023-08-30T07:01:42Z","published":"2023-08-14T09:04:06Z","title":"HHTrack: Hyperspectral Object Tracking Using Hybrid Attention","summary":" Hyperspectral imagery provides abundant spectral information beyond the\nvisible RGB bands, offering rich discriminative details about objects in a\nscene. Leveraging such data has the potential to enhance visual tracking\nperformance. In this paper, we propose a hyperspectral object tracker based on\nhybrid attention (HHTrack). 
The core of HHTrack is a hyperspectral hybrid\nattention (HHA) module that unifies feature extraction and fusion within one\ncomponent through token interactions. A hyperspectral bands fusion (HBF) module\nis also introduced to selectively aggregate spatial and spectral signatures\nfrom the full hyperspectral input. Extensive experiments demonstrate the\nstate-of-the-art performance of HHTrack on benchmark Near Infrared (NIR), Red\nNear Infrared (Red-NIR), and Visible (VIS) hyperspectral tracking datasets. Our\nwork provides new insights into harnessing the strengths of transformers and\nhyperspectral fusion to advance robust object tracking.\n","authors":["Yuedong Tan"],"pdf_url":"https://arxiv.org/pdf/2308.07016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.11499v3","updated":"2023-08-30T06:57:57Z","published":"2022-08-24T12:47:58Z","title":"Semi-supervised Semantic Segmentation with Mutual Knowledge Distillation","summary":" Consistency regularization has been widely studied in recent semi-supervised\nsemantic segmentation methods, and promising performance has been achieved. In\nthis work, we propose a new consistency regularization framework, termed mutual\nknowledge distillation (MKD), combined with data and feature augmentation. We\nintroduce two auxiliary mean-teacher models based on consistency\nregularization. More specifically, we use the pseudo-labels generated by a mean\nteacher to supervise the student network to achieve a mutual knowledge\ndistillation between the two branches. In addition to using image-level strong\nand weak augmentation, we also discuss feature augmentation. This involves\nconsidering various sources of knowledge to distill the student network. Thus,\nwe can significantly increase the diversity of the training samples.\nExperiments on public benchmarks show that our framework outperforms previous\nstate-of-the-art (SOTA) methods under various semi-supervised settings. Code is\navailable at semi-mmseg.\n","authors":["Jianlong Yuan","Jinchao Ge","Zhibin Wang","Yifan Liu"],"pdf_url":"https://arxiv.org/pdf/2208.11499v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15795v1","updated":"2023-08-30T06:56:53Z","published":"2023-08-30T06:56:53Z","title":"Occlusion-Aware Detection and Re-ID Calibrated Network for Multi-Object\n Tracking","summary":" Multi-Object Tracking (MOT) is a crucial computer vision task that aims to\npredict the bounding boxes and identities of objects simultaneously. While\nstate-of-the-art methods have made remarkable progress by jointly optimizing\nthe multi-task problems of detection and Re-ID feature learning, few\napproaches explore how to tackle the occlusion issue, which is a long-standing\nchallenge in the MOT field. Generally, occluded objects may hinder the detector\nfrom estimating the bounding boxes, resulting in fragmented trajectories. Moreover,\nthe learned occluded Re-ID embeddings are less distinct since they contain\ninterference. To this end, we propose an occlusion-aware detection and Re-ID\ncalibrated network for multi-object tracking, termed as ORCTrack. Specifically,\nwe propose an Occlusion-Aware Attention (OAA) module in the detector that\nhighlights the object features while suppressing the occluded background\nregions. OAA can serve as a modulator that enhances the detector for some\npotentially occluded objects. 
Furthermore, we design a Re-ID embedding matching\nblock based on the optimal transport problem, which focuses on enhancing and\ncalibrating the Re-ID representations through different adjacent frames\ncomplementarily. To validate the effectiveness of the proposed method,\nextensive experiments are conducted on two challenging VisDrone2021-MOT and\nKITTI benchmarks. Experimental evaluations demonstrate the superiority of our\napproach, which can achieve new state-of-the-art performance and enjoy high\nrun-time efficiency.\n","authors":["Yukun Su","Ruizhou Sun","Xin Shu","Yu Zhang","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2308.15795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15791v1","updated":"2023-08-30T06:49:34Z","published":"2023-08-30T06:49:34Z","title":"Neural Video Compression with Temporal Layer-Adaptive Hierarchical\n B-frame Coding","summary":" Neural video compression (NVC) is a rapidly evolving video coding research\narea, with some models achieving superior coding efficiency compared to the\nlatest video coding standard Versatile Video Coding (VVC). In conventional\nvideo coding standards, the hierarchical B-frame coding, which utilizes a\nbidirectional prediction structure for higher compression, had been\nwell-studied and exploited. In NVC, however, limited research has investigated\nthe hierarchical B scheme. In this paper, we propose an NVC model exploiting\nhierarchical B-frame coding with temporal layer-adaptive optimization. We first\nextend an existing unidirectional NVC model to a bidirectional model, which\nachieves -21.13% BD-rate gain over the unidirectional baseline model. However,\nthis model faces challenges when applied to sequences with complex or large\nmotions, leading to performance degradation. To address this, we introduce\ntemporal layer-adaptive optimization, incorporating methods such as temporal\nlayer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent\nscaling (TALS). The final model with the proposed methods achieves an\nimpressive BD-rate gain of -39.86% against the baseline. It also resolves the\nchallenges in sequences with large or complex motions with up to -49.13% more\nBD-rate gains than the simple bidirectional extension. This improvement is\nattributed to the allocation of more bits to lower temporal layers, thereby\nenhancing overall reconstruction quality with smaller bits. Since our method\nhas little dependency on a specific NVC model architecture, it can serve as a\ngeneral tool for extending unidirectional NVC models to the ones with\nhierarchical B-frame coding.\n","authors":["Yeongwoong Kim","Suyong Bahk","Seungeon Kim","Won Hee Lee","Dokwan Oh","Hui Yong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07394v3","updated":"2023-08-30T06:36:08Z","published":"2022-06-15T08:55:47Z","title":"Efficient Adaptive Ensembling for Image Classification","summary":" In recent times, with the exception of sporadic cases, the trend in Computer\nVision is to achieve minor improvements compared to considerable increases in\ncomplexity.\n To reverse this trend, we propose a novel method to boost image\nclassification performances without increasing complexity.\n To this end, we revisited ensembling, a powerful approach, often not used\nproperly due to its more complex nature and the training time, so as to make it\nfeasible through a specific design choice. 
First, we trained two\nEfficientNet-b0 end-to-end models (known to be the architecture with the best\noverall accuracy/complexity trade-off for image classification) on disjoint\nsubsets of data (i.e. bagging). Then, we made an efficient adaptive ensemble by\nperforming fine-tuning of a trainable combination layer. In this way, we were\nable to outperform the state-of-the-art by an average of 0.5$\\%$ in\naccuracy, with restrained complexity both in terms of the number of parameters\n(by 5-60 times) and FLoating point Operations Per Second (FLOPS) (by 10-100\ntimes) on several major benchmark datasets.\n","authors":["Antonio Bruno","Davide Moroni","Massimo Martinelli"],"pdf_url":"https://arxiv.org/pdf/2206.07394v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04557v2","updated":"2023-08-30T06:30:43Z","published":"2023-03-08T13:15:19Z","title":"Scene Matters: Model-based Deep Video Compression","summary":" Video compression has always been a popular research area, where many\ntraditional and deep video compression methods have been proposed. These\nmethods typically rely on signal prediction theory to enhance compression\nperformance by designing highly efficient intra and inter prediction strategies\nand compressing video frames one by one. In this paper, we propose a novel\nmodel-based video compression (MVC) framework that regards scenes as the\nfundamental units for video sequences. Our proposed MVC directly models the\nintensity variation of the entire video sequence in one scene, seeking\nnon-redundant representations instead of reducing redundancy through\nspatio-temporal predictions. To achieve this, we employ implicit neural\nrepresentation as our basic modeling architecture. To improve the efficiency of\nvideo modeling, we first propose context-related spatial positional embedding\nand frequency domain supervision in spatial context enhancement. For temporal\ncorrelation capturing, we design the scene flow constraint mechanism and\ntemporal contrastive loss. Extensive experimental results demonstrate that our\nmethod achieves up to a 20\\% bitrate reduction compared to the latest video\ncoding standard H.266 and is more efficient in decoding than existing video\ncoding strategies.\n","authors":["Lv Tang","Xinfeng Zhang","Gai Zhang","Xiaoqi Ma"],"pdf_url":"https://arxiv.org/pdf/2303.04557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10799v2","updated":"2023-08-30T05:01:31Z","published":"2023-06-19T09:39:10Z","title":"SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend\n 3D Talking Faces","summary":" Speech-driven 3D face animation is a technique whose applications extend to\nvarious multimedia fields. Previous research has generated promising realistic\nlip movements and facial expressions from audio signals. However, traditional\nregression models solely driven by data face several essential problems, such\nas difficulties in accessing precise labels and domain gaps between different\nmodalities, leading to unsatisfactory results lacking precision and coherence.\nTo enhance the visual accuracy of generated lip movement while reducing the\ndependence on labeled data, we propose a novel framework, SelfTalk, which involves\nself-supervision in a cross-modal network system to learn 3D talking faces.\nThe framework constructs a network system consisting of three modules: facial\nanimator, speech recognizer, and lip-reading interpreter. 
The core of SelfTalk\nis a commutative training diagram that facilitates compatible feature exchange\namong audio, text, and lip shape, enabling our models to learn the intricate\nconnection between these factors. The proposed framework leverages the\nknowledge learned from the lip-reading interpreter to generate more plausible\nlip shapes. Extensive experiments and user studies demonstrate that our\nproposed approach achieves state-of-the-art performance both qualitatively and\nquantitatively. We recommend watching the supplementary video.\n","authors":["Ziqiao Peng","Yihao Luo","Yue Shi","Hao Xu","Xiangyu Zhu","Jun He","Hongyan Liu","Zhaoxin Fan"],"pdf_url":"https://arxiv.org/pdf/2306.10799v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2304.04027v3","updated":"2023-08-30T04:55:04Z","published":"2023-04-08T14:40:35Z","title":"Estimating 3D Dental Structures using Simulated Panoramic Radiographs\n and Neural Ray Tracing","summary":" Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality\nfor dental examination. However, PX only provides a flattened 2D image, lacking\na 3D view of the oral structure. In this paper, we propose a framework to\nestimate 3D oral structures from real-world PX. Our framework tackles full 3D\nreconstruction for varying subjects (patients) where each reconstruction is\nbased only on a single panoramic image. We create an intermediate\nrepresentation called simulated PX (SimPX) from 3D Cone-beam computed\ntomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and\nrotational principles of PX imaging. SimPX aims not only to truthfully\nsimulate PX, but also to facilitate the reverting process back to 3D data. We\npropose a novel neural model based on ray tracing which exploits both global\nand local input features to convert SimPX to 3D output. At inference, a real PX\nimage is translated to a SimPX-style image with semantic regularization, and\nthe translated image is processed by the generation module to produce high-quality\noutputs. Experiments show that our method outperforms the prior state-of-the-art in\nreconstruction tasks both quantitatively and qualitatively. Unlike prior\nmethods, our method does not require any prior information such as the shape of\ndental arches, nor the matched PX-CBCT dataset for training, which is difficult\nto obtain in clinical practice.\n","authors":["Sihwa Park","Seongjun Kim","Doeyoung Kwon","Yohan Jang","In-Seok Song","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2304.04027v3.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2306.01762v2","updated":"2023-08-30T04:53:15Z","published":"2023-05-27T06:00:51Z","title":"Pre-trained transformer for adversarial purification","summary":" With more and more deep neural networks being deployed as various daily\nservices, their reliability is essential. It's frightening that deep neural\nnetworks are vulnerable and sensitive to adversarial attacks, the most common\none of which for the services is evasion-based. Recent works usually strengthen\nthe robustness by adversarial training or leveraging the knowledge of an amount\nof clean data. However, in practical terms, retraining and redeploying the\nmodel need a large computational budget, leading to heavy losses to the online\nservice. In addition, when adversarial examples of a certain attack are\ndetected, only limited adversarial examples are available for the service\nprovider, while much clean data may not be accessible. 
Given the mentioned\nproblems, we propose a new scenario, RaPiD (Rapid Plug-in Defender), which is\nto rapidly defend against a certain attack for the frozen original service\nmodel with limitations of few clean and adversarial examples. Motivated by the\ngeneralization and the universal computation ability of pre-trained transformer\nmodels, we propose a new defender method, CeTaD, which stands for\nConsidering Pre-trained Transformers as Defenders. In particular, we evaluate\nthe effectiveness and the transferability of CeTaD in the case of one-shot\nadversarial examples and explore the impact of different parts of CeTaD as well\nas training data conditions. CeTaD is flexible, able to be embedded into an\narbitrary differentiable model, and suitable for various types of attacks.\n","authors":["Kai Wu","Yujian Betterest Li","Xiaoyu Zhang","Handing Wang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.01762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01418v3","updated":"2023-08-30T04:41:10Z","published":"2023-03-02T17:09:27Z","title":"Human Motion Diffusion as a Generative Prior","summary":" Recent work has demonstrated the significant potential of denoising diffusion\nmodels for generating human motion, including text-to-motion capabilities.\nHowever, these methods are restricted by the paucity of annotated motion data,\na focus on single-person motions, and a lack of detailed control. In this\npaper, we introduce three forms of composition based on diffusion priors:\nsequential, parallel, and model composition. Using sequential composition, we\ntackle the challenge of long sequence generation. We introduce DoubleTake, an\ninference-time method with which we generate long animations consisting of\nsequences of prompted intervals and their transitions, using a prior trained\nonly for short clips. Using parallel composition, we show promising steps\ntoward two-person generation. Beginning with two fixed priors as well as a few\ntwo-person training examples, we learn a slim communication block, ComMDM, to\ncoordinate interaction between the two resulting motions. Lastly, using model\ncomposition, we first train individual priors to complete motions that realize\na prescribed motion for a given joint. We then introduce DiffusionBlending, an\ninterpolation mechanism to effectively blend several such models to enable\nflexible and efficient fine-grained joint and trajectory-level control and\nediting. We evaluate the composition methods using an off-the-shelf motion\ndiffusion model, and further compare the results to dedicated models trained\nfor these specific tasks.\n","authors":["Yonatan Shafir","Guy Tevet","Roy Kapon","Amit H. Bermano"],"pdf_url":"https://arxiv.org/pdf/2303.01418v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15752v1","updated":"2023-08-30T04:29:48Z","published":"2023-08-30T04:29:48Z","title":"Large-scale data extraction from the UNOS organ donor documents","summary":" The scope of our study is all UNOS data of the USA organ donors since 2008.\nThe data was not analyzable at a large scale in the past because it was captured\nin PDF documents known as \"Attachments\", whereby every donor is represented by\ndozens of PDF documents in heterogeneous formats. To make the data analyzable,\none needs to convert the content inside these PDFs to an analyzable data\nformat, such as a standard SQL database. In this paper we will focus on the 2022\nUNOS data, comprising $\\approx 400,000$ PDF documents spanning millions of\npages. 
The totality of UNOS data covers 15 years (2008--2022) and our results\nwill be quickly extended to the entire dataset. Our method captures a portion of\nthe data in DCD flowsheets, kidney perfusion data, and data captured during\npatient hospital stay (e.g. vital signs, ventilator settings, etc.). The\ncurrent paper assumes that the reader is familiar with the content of the UNOS\ndata. The overview of the types of data and challenges they present is a\nsubject of another paper. Here we focus on demonstrating that the goal of\nbuilding a comprehensive, analyzable database from UNOS documents is an\nattainable task, and we provide an overview of our methodology. The project\nresulted in datasets far larger than previously available, even in this\npreliminary phase.\n","authors":["Marek Rychlik","Bekir Tanriover","Yan Han"],"pdf_url":"https://arxiv.org/pdf/2308.15752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03741v4","updated":"2023-08-30T04:18:50Z","published":"2022-12-07T16:10:08Z","title":"FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance\n Generation","summary":" Generating full-body and multi-genre dance sequences from given music is a\nchallenging task, due to the limitations of existing datasets and the inherent\ncomplexity of the fine-grained hand motion and dance genres. To address these\nproblems, we propose FineDance, which contains 14.6 hours of music-dance paired\ndata, with fine-grained hand motions, fine-grained genres (22 dance genres),\nand accurate posture. To the best of our knowledge, FineDance is the largest\nmusic-dance paired dataset with the most dance genres. Additionally, to address\nmonotonous and unnatural hand movements existing in previous methods, we\npropose a full-body dance generation network, which utilizes the diverse\ngeneration capabilities of the diffusion model to solve monotonous problems,\nand uses expert nets to solve unreal problems. To further enhance the\ngenre-matching and long-term stability of generated dances, we propose a\nGenre&Coherent aware Retrieval Module. Besides, we propose a novel metric named\nGenre Matching Score to evaluate the genre-matching degree between dance and\nmusic. Quantitative and qualitative experiments demonstrate the quality of\nFineDance, and the state-of-the-art performance of FineNet. The FineDance\nDataset and more qualitative samples can be found at our website.\n","authors":["Ronghui Li","Junfan Zhao","Yachao Zhang","Mingyang Su","Zeping Ren","Han Zhang","Yansong Tang","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2212.03741v4.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2305.12596v2","updated":"2023-08-30T03:55:54Z","published":"2023-05-21T23:10:14Z","title":"iWarpGAN: Disentangling Identity and Style to Generate Synthetic Iris\n Images","summary":" Generative Adversarial Networks (GANs) have shown success in approximating\ncomplex distributions for synthetic image generation. However, current\nGAN-based methods for generating biometric images, such as iris, have certain\nlimitations: (a) the synthetic images often closely resemble images in the\ntraining dataset; (b) the generated images lack diversity in terms of the\nnumber of unique identities represented in them; and (c) it is difficult to\ngenerate multiple images pertaining to the same identity. 
To overcome these\nissues, we propose iWarpGAN that disentangles identity and style in the context\nof the iris modality by using two transformation pathways: Identity\nTransformation Pathway to generate unique identities from the training set, and\nStyle Transformation Pathway to extract the style code from a reference image\nand output an iris image using this style. By concatenating the transformed\nidentity code and reference style code, iWarpGAN generates iris images with\nboth inter- and intra-class variations. The efficacy of the proposed method in\ngenerating such iris DeepFakes is evaluated both qualitatively and\nquantitatively using ISO/IEC 29794-6 Standard Quality Metrics and the VeriEye\niris matcher. Further, the utility of the synthetically generated images is\ndemonstrated by improving the performance of deep learning based iris matchers\nthat augment synthetic data with real data during the training process.\n","authors":["Shivangi Yadav","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2305.12596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15740v1","updated":"2023-08-30T03:35:55Z","published":"2023-08-30T03:35:55Z","title":"Beard Segmentation and Recognition Bias","summary":" A person's facial hairstyle, such as presence and size of beard, can\nsignificantly impact face recognition accuracy. There are publicly-available\ndeep networks that achieve reasonable accuracy at binary attribute\nclassification, such as beard / no beard, but few if any that segment the\nfacial hair region. To investigate the effect of facial hair in a rigorous\nmanner, we first created a set of fine-grained facial hair annotations to train\na segmentation model and evaluate its accuracy across African-American and\nCaucasian face images. We then use our facial hair segmentations to categorize\nimage pairs according to the degree of difference or similarity in the facial\nhairstyle. We find that the False Match Rate (FMR) for image pairs with\ndifferent categories of facial hairstyle varies by a factor of over 10 for\nAfrican-American males and over 25 for Caucasian males. To reduce the bias\nacross image pairs with different facial hairstyles, we propose a scheme for\nadaptive thresholding based on facial hairstyle similarity. Evaluation on a\nsubject-disjoint set of images shows that adaptive similarity thresholding\nbased on facial hairstyles of the image pair reduces the ratio between the\nhighest and lowest FMR across facial hairstyle categories for African-American\nfrom 10.7 to 1.8 and for Caucasians from 25.9 to 1.3. Facial hair annotations\nand facial hair segmentation model will be publicly available.\n","authors":["Kagan Ozturk","Grace Bezold","Aman Bhatta","Haiyu Wu","Kevin Bowyer"],"pdf_url":"https://arxiv.org/pdf/2308.15740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02862v2","updated":"2023-08-30T03:21:29Z","published":"2023-03-06T03:27:17Z","title":"EvHandPose: Event-based 3D Hand Pose Estimation with Sparse Supervision","summary":" Event camera shows great potential in 3D hand pose estimation, especially\naddressing the challenges of fast motion and high dynamic range in a low-power\nway. However, due to the asynchronous differential imaging mechanism, it is\nchallenging to design event representation to encode hand motion information\nespecially when the hands are not moving (causing motion ambiguity), and it is\ninfeasible to fully annotate the temporally dense event stream. 
In this paper,\nwe propose EvHandPose with novel hand flow representations in Event-to-Pose\nmodule for accurate hand pose estimation and alleviating the motion ambiguity\nissue. To solve the problem under sparse annotation, we design contrast\nmaximization and hand-edge constraints in Pose-to-IWE (Image with Warped\nEvents) module and formulate EvHandPose in a weakly-supervision framework. We\nfurther build EvRealHands, the first large-scale real-world event-based hand\npose dataset on several challenging scenes to bridge the real-synthetic domain\ngap. Experiments on EvRealHands demonstrate that EvHandPose outperforms\nprevious event-based methods under all evaluation scenes, achieves accurate and\nstable hand pose estimation with high temporal resolution in fast motion and\nstrong light scenes compared with RGB-based methods, generalizes well to\noutdoor scenes and another type of event camera, and shows the potential for\nthe hand gesture recognition task.\n","authors":["Jianping Jiang","Jiahe Li","Baowen Zhang","Xiaoming Deng","Boxin Shi"],"pdf_url":"https://arxiv.org/pdf/2303.02862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15733v1","updated":"2023-08-30T03:17:57Z","published":"2023-08-30T03:17:57Z","title":"Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale\n Drone Survey","summary":" Neural rendering has garnered substantial attention owing to its capacity for\ncreating realistic 3D scenes. However, its applicability to extensive scenes\nremains challenging, with limitations in effectiveness. In this work, we\npropose the Drone-NeRF framework to enhance the efficient reconstruction of\nunbounded large-scale scenes suited for drone oblique photography using Neural\nRadiance Fields (NeRF). Our approach involves dividing the scene into uniform\nsub-blocks based on camera position and depth visibility. Sub-scenes are\ntrained in parallel using NeRF, then merged for a complete scene. We refine the\nmodel by optimizing camera poses and guiding NeRF with a uniform sampler.\nIntegrating chosen samples enhances accuracy. A hash-coded fusion MLP\naccelerates density representation, yielding RGB and Depth outputs. Our\nframework accounts for sub-scene constraints, reduces parallel-training noise,\nhandles shadow occlusion, and merges sub-regions for a polished rendering\nresult. This Drone-NeRF framework demonstrates promising capabilities in\naddressing challenges related to scene complexity, rendering efficiency, and\naccuracy in drone-obtained imagery.\n","authors":["Zhihao Jia","Bing Wang","Changhao Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15733v1.pdf","comment":"15 pages, 7 figures, in submission"},{"id":"http://arxiv.org/abs/2303.07543v4","updated":"2023-08-30T03:12:34Z","published":"2023-03-14T00:13:57Z","title":"WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant\n Analysis","summary":" Deep neural networks are susceptible to generating overconfident yet\nerroneous predictions when presented with data beyond known concepts. This\nchallenge underscores the importance of detecting out-of-distribution (OOD)\nsamples in the open world. In this work, we propose a novel feature-space OOD\ndetection score based on class-specific and class-agnostic information.\nSpecifically, the approach utilizes Whitened Linear Discriminant Analysis to\nproject features into two subspaces - the discriminative and residual subspaces\n- for which the in-distribution (ID) classes are maximally separated and\nclosely clustered, respectively. 
The OOD score is then determined by combining\nthe deviation from the input data to the ID pattern in both subspaces. The\nefficacy of our method, named WDiscOOD, is verified on the large-scale\nImageNet-1k benchmark, with six OOD datasets that cover a variety of\ndistribution shifts. WDiscOOD demonstrates superior performance on deep\nclassifiers with diverse backbone architectures, including CNN and vision\ntransformer. Furthermore, we also show that WDiscOOD more effectively detects\nnovel concepts in representation spaces trained with contrastive objectives,\nincluding supervised contrastive loss and multi-modality contrastive loss.\n","authors":["Yiye Chen","Yunzhi Lin","Ruinian Xu","Patricio A. Vela"],"pdf_url":"https://arxiv.org/pdf/2303.07543v4.pdf","comment":"Accepted by ICCV 2023. Code is available at:\n https://github.com/ivalab/WDiscOOD.git"},{"id":"http://arxiv.org/abs/2305.04466v2","updated":"2023-08-30T03:10:19Z","published":"2023-05-08T05:34:15Z","title":"Generalized Universal Domain Adaptation with Generative Flow Networks","summary":" We introduce a new problem in unsupervised domain adaptation, termed as\nGeneralized Universal Domain Adaptation (GUDA), which aims to achieve precise\nprediction of all target labels including unknown categories. GUDA bridges the\ngap between label distribution shift-based and label space mismatch-based\nvariants, essentially categorizing them as a unified problem, guiding to a\ncomprehensive framework for thoroughly solving all the variants. The key\nchallenge of GUDA is developing and identifying novel target categories while\nestimating the target label distribution. To address this problem, we take\nadvantage of the powerful exploration capability of generative flow networks\nand propose an active domain adaptation algorithm named GFlowDA, which selects\ndiverse samples with probabilities proportional to a reward function. To\nenhance the exploration capability and effectively perceive the target label\ndistribution, we tailor the states and rewards, and introduce an efficient\nsolution for parent exploration and state transition. We also propose a\ntraining paradigm for GUDA called Generalized Universal Adversarial Network\n(GUAN), which involves collaborative optimization between GUAN and GFlowNet.\nTheoretical analysis highlights the importance of exploration, and extensive\nexperiments on benchmark datasets demonstrate the superiority of GFlowDA.\n","authors":["Didi Zhu","Yinchuan Li","Yunfeng Shao","Jianye Hao","Fei Wu","Kun Kuang","Jun Xiao","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2305.04466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15724v1","updated":"2023-08-30T02:56:55Z","published":"2023-08-30T02:56:55Z","title":"Background Debiased SAR Target Recognition via Causal Interventional\n Regularizer","summary":" Recent studies have utilized deep learning (DL) techniques to automatically\nextract features from synthetic aperture radar (SAR) images, which shows great\npromise for enhancing the performance of SAR automatic target recognition\n(ATR). However, our research reveals a previously overlooked issue: SAR images\nto be recognized include not only the foreground (i.e., the target), but also a\ncertain size of the background area. 
When a DL-model is trained exclusively on\nforeground data, its recognition performance is significantly superior to a\nmodel trained on original data that includes both foreground and background.\nThis suggests that the presence of background impedes the ability of the\nDL-model to learn additional semantic information about the target. To address\nthis issue, we construct a structural causal model (SCM) that incorporates the\nbackground as a confounder. Based on the constructed SCM, we propose a causal\nintervention based regularization method to eliminate the negative impact of\nbackground on feature semantic learning and achieve background debiased\nSAR-ATR. The proposed causal interventional regularizer can be integrated into\nany existing DL-based SAR-ATR models to mitigate the impact of background\ninterference on the feature extraction and recognition accuracy. Experimental\nresults on the Moving and Stationary Target Acquisition and Recognition (MSTAR)\ndataset indicate that the proposed method can enhance the efficiency of\nexisting DL-based methods in a plug-and-play manner.\n","authors":["Hongwei Dong","Fangzhou Han","Lingyu Si","Wenwen Qiang","Lamei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15724v1.pdf","comment":"38 pages, 8 figures"},{"id":"http://arxiv.org/abs/2304.11862v4","updated":"2023-08-30T02:55:09Z","published":"2023-04-24T07:16:54Z","title":"Universal Domain Adaptation via Compressive Attention Matching","summary":" Universal domain adaptation (UniDA) aims to transfer knowledge from the\nsource domain to the target domain without any prior knowledge about the label\nset. The challenge lies in how to determine whether the target samples belong\nto common categories. The mainstream methods make judgments based on the sample\nfeatures, which overemphasizes global information while ignoring the most\ncrucial local objects in the image, resulting in limited accuracy. To address\nthis issue, we propose a Universal Attention Matching (UniAM) framework by\nexploiting the self-attention mechanism in vision transformer to capture the\ncrucial object information. The proposed framework introduces a novel\nCompressive Attention Matching (CAM) approach to explore the core information\nby compressively representing attentions. Furthermore, CAM incorporates a\nresidual-based measurement to determine the sample commonness. By utilizing the\nmeasurement, UniAM achieves domain-wise and category-wise Common Feature\nAlignment (CFA) and Target Class Separation (TCS). Notably, UniAM is the first\nmethod utilizing the attention in vision transformer directly to perform\nclassification tasks. 
Extensive experiments show that UniAM outperforms the\ncurrent state-of-the-art methods on various benchmark datasets.\n","authors":["Didi Zhu","Yinchuan Li","Junkun Yuan","Zexi Li","Kun Kuang","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2304.11862v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10720v3","updated":"2023-08-30T02:47:27Z","published":"2023-06-19T06:41:19Z","title":"Exploring the Relationship between Samples and Masks for Robust Defect\n Localization","summary":" Defect detection aims to detect and localize regions out of the normal\ndistribution. Previous approaches model normality and compare it with the input\nto identify defective regions, potentially limiting their generalizability. This\npaper proposes a one-stage framework that detects defective patterns directly\nwithout the modeling process. This ability is achieved through the joint efforts\nof three parties: a generative adversarial network (GAN), a newly proposed\nscaled pattern loss, and a dynamic masked cycle-consistent auxiliary network.\nExplicit information that could indicate the position of defects is\nintentionally excluded to avoid learning any direct mapping. Experimental\nresults on the texture class of the challenging MVTec AD dataset show that the\nproposed method is 2.9\% higher than the SOTA methods in F1-Score, while\nsubstantially outperforming SOTA methods in generalizability.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10720v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10421v2","updated":"2023-08-30T02:32:08Z","published":"2023-08-21T02:13:40Z","title":"UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D\n Representation for 3D Perception in Autonomous Driving","summary":" Masked Autoencoders (MAE) play a pivotal role in learning potent\nrepresentations, delivering outstanding results across various 3D perception\ntasks essential for autonomous driving. In real-world driving scenarios, it's\ncommonplace to deploy multiple sensors for comprehensive environment\nperception. While integrating multi-modal features from these sensors can\nproduce rich and powerful features, there is a noticeable gap in MAE methods\naddressing this integration. This research delves into multi-modal Masked\nAutoencoders tailored for a unified representation space in autonomous driving,\naiming to pioneer a more efficient fusion of two distinct modalities. To\nintricately marry the semantics inherent in images with the geometric\nintricacies of LiDAR point clouds, UniM$^2$AE is proposed. This model\nstands as a potent yet straightforward multi-modal self-supervised\npre-training framework, mainly consisting of two designs. First, it projects\nthe features from both modalities into a cohesive 3D volume space, ingeniously\nexpanded from the bird's eye view (BEV) to include the height dimension. The\nextension makes it possible to back-project the informative features, obtained\nby fusing features from both modalities, into their native modalities to\nreconstruct the multiple masked inputs. Second, the Multi-modal 3D Interactive\nModule (MMIM) is invoked to facilitate efficient inter-modal interaction.\nExtensive experiments conducted on the nuScenes\nDataset attest to the efficacy of UniM$^2$AE, indicating enhancements in 3D\nobject detection and BEV map segmentation by 1.2\% (NDS) and 6.5\% (mIoU),\nrespectively.
Code is available at https://github.com/hollow-503/UniM2AE.\n","authors":["Jian Zou","Tianyu Huang","Guanglei Yang","Zhenhua Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.10421v2.pdf","comment":"Code available at https://github.com/hollow-503/UniM2AE"},{"id":"http://arxiv.org/abs/2303.17895v4","updated":"2023-08-30T02:10:53Z","published":"2023-03-31T08:56:29Z","title":"EA-LSS: Edge-aware Lift-splat-shot Framework for 3D BEV Object Detection","summary":" In recent years, great progress has been made in the Lift-Splat-Shot-based\n(LSS-based) 3D object detection method. However, inaccurate depth estimation\nremains an important constraint to the accuracy of camera-only and multi-modal\n3D object detection models, especially in regions where the depth changes\nsignificantly (i.e., the \"depth jump\" problem). In this paper, we propose a\nnovel Edge-aware Lift-splat-shot (EA-LSS) framework. Specifically, an edge-aware\ndepth fusion (EADF) module is proposed to alleviate the \"depth jump\" problem\nand a fine-grained depth (FGD) module to further enforce refined supervision on\ndepth. Our EA-LSS framework is compatible with any LSS-based 3D object detection\nmodel, and effectively boosts their performance with a negligible increase in\ninference time. Experiments on nuScenes benchmarks demonstrate that EA-LSS is\neffective in both camera-only and multi-modal models. It is worth mentioning\nthat EA-LSS achieved state-of-the-art performance on the nuScenes test\nbenchmark with mAP and NDS of 76.5% and 77.6%, respectively.\n","authors":["Haotian Hu","Fanyi Wang","Jingwen Su","Yaonong Wang","Laifeng Hu","Weiye Fang","Jingwei Xu","Zhiwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.17895v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15705v1","updated":"2023-08-30T02:01:19Z","published":"2023-08-30T02:01:19Z","title":"Towards Earlier Detection of Oral Diseases On Smartphones Using Oral and\n Dental RGB Images","summary":" Oral diseases such as periodontal (gum) diseases and dental caries (cavities)\naffect billions of people across the world today. However, previous\nstate-of-the-art models have relied on X-ray images to detect oral diseases,\nmaking them inaccessible to remote monitoring, developing countries, and\ntelemedicine. To combat this overuse of X-ray imagery, we propose a lightweight\nmachine learning model capable of detecting calculus (also known as hardened\nplaque or tartar) in RGB images while running efficiently on low-end devices.\nThe model, a modified MobileNetV3-Small neural network transfer-learned from\nImageNet, achieved an accuracy of 72.73% (which is comparable to\nstate-of-the-art solutions) while still being able to run on mobile devices due\nto its reduced memory requirements and processing times. A ResNet34-based model\nwas also constructed and achieved an accuracy of 81.82%. Both of these models\nwere tested on a mobile app, demonstrating their potential to limit the number\nof serious oral disease cases as their predictions can help patients schedule\nappointments earlier without the need to go to the clinic.\n","authors":["Ayush Garg","Julia Lu","Anika Maji"],"pdf_url":"https://arxiv.org/pdf/2308.15705v1.pdf","comment":"10 pages, 6 figures, 1 formula.
This research was conducted as a\n mentored project for a college course and research program at the\n University of California Santa Barbara's Summer Research Academies program"},{"id":"http://arxiv.org/abs/2308.15005v2","updated":"2023-08-30T01:54:27Z","published":"2023-08-29T03:54:26Z","title":"Few-Shot Object Detection via Synthetic Features with Optimal Transport","summary":" Few-shot object detection aims to simultaneously localize and classify the\nobjects in an image with limited training samples. However, most existing\nfew-shot object detection methods focus on extracting the features of a few\nsamples of novel classes that lack diversity. Hence, they may not be sufficient\nto capture the data distribution. To address that limitation, in this paper, we\npropose a novel approach in which we train a generator to generate synthetic\ndata for novel classes. Still, directly training a generator on the novel class\nis not effective due to the lack of novel data. To overcome that issue, we\nleverage the large-scale dataset of base classes. Our overarching goal is to\ntrain a generator that captures the data variations of the base dataset. We\nthen transform the captured variations into novel classes by generating\nsynthetic data with the trained generator. To encourage the generator to\ncapture data variations on base classes, we propose to train the generator with\nan optimal transport loss that minimizes the optimal transport distance between\nthe distributions of real and synthetic data. Extensive experiments on two\nbenchmark datasets demonstrate that the proposed method outperforms the state\nof the art. Source code will be available.\n","authors":["Anh-Khoa Nguyen Vu","Thanh-Toan Do","Vinh-Tiep Nguyen","Tam Le","Minh-Triet Tran","Tam V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.15005v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06767v3","updated":"2023-08-30T01:25:29Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects high-quality samples, discards those that exhibit\nundesired behavior, and subsequently enhances the model by fine-tuning on\nthese filtered samples.
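The optimal transport loss in the few-shot synthetic-feature entry above can be prototyped with a small entropic-regularized solver. Below is a minimal sketch, assuming uniform marginals, a squared-Euclidean ground cost, and a handful of Sinkhorn iterations; the feature shapes and the `real_feats`/`synth_feats` names are illustrative choices, not taken from the paper.

```python
import numpy as np

def sinkhorn_ot_distance(x: np.ndarray, y: np.ndarray,
                         reg: float = 0.1, n_iters: int = 100) -> float:
    """Entropy-regularized OT distance between two empirical feature sets.

    x: (n, d) real features, y: (m, d) synthetic features.
    """
    n, m = len(x), len(y)
    a, b = np.full(n, 1.0 / n), np.full(m, 1.0 / m)   # uniform marginals
    # Pairwise squared-Euclidean cost matrix.
    cost = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
    K = np.exp(-cost / reg)                           # Gibbs kernel
    u = np.ones(n)
    for _ in range(n_iters):                          # Sinkhorn fixed-point updates
        v = b / (K.T @ u)
        u = a / (K @ v)
    plan = u[:, None] * K * v[None, :]                # transport plan
    return float((plan * cost).sum())

# Toy usage: a generator would be trained to minimize this distance.
rng = np.random.default_rng(0)
real_feats = rng.normal(size=(64, 128))
synth_feats = rng.normal(loc=0.5, size=(64, 128))
print(sinkhorn_ot_distance(real_feats, synth_feats))
```

In practice the loss would be computed on minibatches of base-class features and backpropagated through the generator; the sketch only shows the distance itself.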
Our studies show that RAFT can effectively improve\nmodel performance, as measured by both reward learning and other automated\nmetrics, for both large language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v3.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.15692v1","updated":"2023-08-30T01:21:11Z","published":"2023-08-30T01:21:11Z","title":"Intriguing Properties of Diffusion Models: A Large-Scale Dataset for\n Evaluating Natural Attack Capability in Text-to-Image Generative Models","summary":" Denoising probabilistic diffusion models have shown breakthrough performance\nthat can generate more photo-realistic images or human-level illustrations than\nprior models such as GANs. This high image-generation capability has\nstimulated the creation of many downstream applications in various areas.\nHowever, we find that this technology is indeed a double-edged sword: We\nidentify a new type of attack, called the Natural Denoising Diffusion (NDD)\nattack, based on the finding that state-of-the-art deep neural network (DNN)\nmodels still maintain their predictions even if we intentionally remove their robust\nfeatures, which are essential to the human visual system (HVS), by text\nprompts. The NDD attack can generate low-cost, model-agnostic, and\ntransferable adversarial attacks by exploiting the natural attack capability\nin diffusion models. Motivated by the finding, we construct a large-scale\ndataset, the Natural Denoising Diffusion Attack (NDDA) dataset, to systematically\nevaluate the risk of the natural attack capability of diffusion models with\nstate-of-the-art text-to-image diffusion models. We evaluate the natural attack\ncapability by answering 6 research questions. Through a user study to confirm\nthe validity of the NDD attack, we find that the NDD attack can achieve an 88%\ndetection rate while being stealthy to 93% of human subjects. We also find that\nthe non-robust features embedded by diffusion models contribute to the natural\nattack capability. To confirm the model-agnostic and transferable attack\ncapability, we perform the NDD attack against an autonomous driving (AD) vehicle and find that 73%\nof the physically printed attacks can be detected as a stop sign. We hope that\nour study and dataset can help our community to be aware of the risk of\ndiffusion models and facilitate further research toward robust DNN models.\n","authors":["Takami Sato","Justin Yue","Nanze Chen","Ningfei Wang","Qi Alfred Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15690v1","updated":"2023-08-30T01:14:32Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprout image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology.
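The RAFT procedure summarized in the entry above reduces to a simple loop: sample candidate generations, score them with the reward model, keep only the highest-reward ones, and fine-tune on the filtered set. A minimal sketch follows; `generate`, `reward_model`, and `finetune` are placeholder hooks standing in for the actual model calls, which the abstract does not specify.

```python
from typing import Callable, List, Tuple

def raft_round(prompts: List[str],
               generate: Callable[[str, int], List[str]],
               reward_model: Callable[[str, str], float],
               finetune: Callable[[List[Tuple[str, str]]], None],
               k_samples: int = 8,
               keep_ratio: float = 0.125) -> None:
    """One round of Reward-rAnked FineTuning: sample, rank, filter, fine-tune."""
    scored = []
    for prompt in prompts:
        for completion in generate(prompt, k_samples):   # sample candidates
            scored.append((reward_model(prompt, completion), prompt, completion))
    scored.sort(key=lambda t: t[0], reverse=True)        # rank by reward
    n_keep = max(1, int(len(scored) * keep_ratio))       # discard low-reward samples
    finetune([(p, c) for _, p, c in scored[:n_keep]])    # supervised fine-tuning step
```

Because each round is plain supervised fine-tuning on filtered data, the loop avoids the RL instabilities the abstract contrasts itself against.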
For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The labels comprise four\nclasses: background, head, body, and tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, five physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. We also hope that this dataset can assist researchers in other\nindustrial fields who study classification, semantic segmentation,\ndecomposition, and physical feature measurement in evaluating their models. The\ndataset is available at the authors' repository (https://bhban.kr/data).\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v1.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"},{"id":"http://arxiv.org/abs/2308.10902v2","updated":"2023-08-30T23:28:53Z","published":"2023-08-21T17:59:54Z","title":"CamP: Camera Preconditioning for Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) can be optimized to obtain high-fidelity 3D\nscene reconstructions of objects and large-scale scenes. However, NeRFs require\naccurate camera parameters as input -- inaccurate camera parameters result in\nblurry renderings. Extrinsic and intrinsic camera parameters are usually\nestimated using Structure-from-Motion (SfM) methods as a pre-processing step to\nNeRF, but these techniques rarely yield perfect estimates. Thus, prior works\nhave proposed jointly optimizing camera parameters alongside a NeRF, but these\nmethods are prone to local minima in challenging settings. In this work, we\nanalyze how different camera parameterizations affect this joint optimization\nproblem, and observe that standard parameterizations exhibit large differences\nin magnitude with respect to small perturbations, which can lead to an\nill-conditioned optimization problem. We propose using a proxy problem to\ncompute a whitening transform that eliminates the correlation between camera\nparameters and normalizes their effects, and we propose to use this transform\nas a preconditioner for the camera parameters during joint optimization. Our\npreconditioned camera optimization significantly improves reconstruction\nquality on scenes from the Mip-NeRF 360 dataset: we reduce error rates (RMSE)\nby 67% compared to state-of-the-art NeRF approaches that do not optimize for\ncameras like Zip-NeRF, and by 29% relative to state-of-the-art joint\noptimization approaches using the camera parameterization of SCNeRF. Our\napproach is easy to implement, does not significantly increase runtime, can be\napplied to a wide variety of camera parameterizations, and can\nstraightforwardly be incorporated into other NeRF-like models.\n","authors":["Keunhong Park","Philipp Henzler","Ben Mildenhall","Jonathan T.
Barron","Ricardo Martin-Brualla"],"pdf_url":"https://arxiv.org/pdf/2308.10902v2.pdf","comment":"SIGGRAPH Asia 2023, Project page: https://camp-nerf.github.io"},{"id":"http://arxiv.org/abs/2308.16355v1","updated":"2023-08-30T23:03:49Z","published":"2023-08-30T23:03:49Z","title":"A Recycling Training Strategy for Medical Image Segmentation with\n Diffusion Denoising Models","summary":" Denoising diffusion models have found applications in image segmentation by\ngenerating segmented masks conditioned on images. Existing studies\npredominantly focus on adjusting model architecture or improving inference such\nas test-time sampling strategies. In this work, we focus on training strategy\nimprovements and propose a novel recycling method. During each training step, a\nsegmentation mask is first predicted given an image and a random noise. This\npredicted mask, replacing the conventional ground truth mask, is used for\ndenoising task during training. This approach can be interpreted as aligning\nthe training strategy with inference by eliminating the dependence on ground\ntruth masks for generating noisy samples. Our proposed method significantly\noutperforms standard diffusion training, self-conditioning, and existing\nrecycling strategies across multiple medical imaging data sets: muscle\nultrasound, abdominal CT, prostate MR, and brain MR. This holds true for two\nwidely adopted sampling strategies: denoising diffusion probabilistic model and\ndenoising diffusion implicit model. Importantly, existing diffusion models\noften display a declining or unstable performance during inference, whereas our\nnovel recycling consistently enhances or maintains performance. Furthermore, we\nshow for the first time that, under a fair comparison with the same network\narchitectures and computing budget, the proposed recycling-based diffusion\nmodels achieved on-par performance with non-diffusion-based supervised\ntraining. This paper summarises these quantitative results and discusses their\nvalues, with a fully reproducible JAX-based implementation, released at\nhttps://github.com/mathpluscode/ImgX-DiffSeg.\n","authors":["Yunguan Fu","Yiwen Li","Shaheer U Saeed","Matthew J Clarkson","Yipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2308.16355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16354v1","updated":"2023-08-30T23:02:26Z","published":"2023-08-30T23:02:26Z","title":"Catalog Phrase Grounding (CPG): Grounding of Product Textual Attributes\n in Product Images for e-commerce Vision-Language Applications","summary":" We present Catalog Phrase Grounding (CPG), a model that can associate product\ntextual data (title, brands) into corresponding regions of product images\n(isolated product region, brand logo region) for e-commerce vision-language\napplications. We use a state-of-the-art modulated multimodal transformer\nencoder-decoder architecture unifying object detection and phrase-grounding. We\ntrain the model in self-supervised fashion with 2.3 million image-text pairs\nsynthesized from an e-commerce site. The self-supervision data is annotated\nwith high-confidence pseudo-labels generated with a combination of teacher\nmodels: a pre-trained general domain phrase grounding model (e.g. MDETR) and a\nspecialized logo detection model. This allows CPG, as a student model, to\nbenefit from transfer knowledge from these base models combining general-domain\nknowledge and specialized knowledge. 
Beyond immediate catalog phrase grounding\ntasks, we can benefit from CPG representations by incorporating them as ML\nfeatures into downstream catalog applications that require deep semantic\nunderstanding of products. Our experiments on product-brand matching, a\nchallenging e-commerce application, show that incorporating CPG representations\ninto the existing production ensemble system leads to an average 5% recall\nimprovement across all countries globally (with the largest lift of 11% in a\nsingle country) at fixed 95% precision, outperforming other alternatives\nincluding a logo detection teacher model and ResNet50.\n","authors":["Wenyi Wu","Karim Bouyarmane","Ismail Tutar"],"pdf_url":"https://arxiv.org/pdf/2308.16354v1.pdf","comment":"KDD 2022 Workshop on First Content Understanding and Generation for\n e-Commerce"},{"id":"http://arxiv.org/abs/2304.09949v2","updated":"2023-08-30T22:41:03Z","published":"2023-04-19T20:03:09Z","title":"Learning Temporal Distribution and Spatial Correlation for Universal\n Moving Object Segmentation","summary":" Universal moving object segmentation aims to provide a general model for\nvideos from all types of natural scenes, whereas previous approaches are usually\neffective only for specific or similar scenes. In this paper, we propose a method\ncalled Learning Temporal Distribution and Spatial Correlation (LTS) that has\nthe potential to be a general solution for universal moving object\nsegmentation. In the proposed approach, the distribution from temporal pixels\nis first learned by our Defect Iterative Distribution Learning (DIDL) network\nfor a scene-independent segmentation. Then, the Stochastic Bayesian Refinement\n(SBR) Network, which learns the spatial correlation, is proposed to improve the\nbinary mask generated by the DIDL network. Benefiting from the scene\nindependence of the temporal distribution and the accuracy improvement\nresulting from the spatial correlation, the proposed approach performs well for\nalmost all videos from diverse and complex natural scenes with fixed\nparameters. Comprehensive experiments on standard datasets including LASIESTA,\nCDNet2014, BMC, SBMI2015 and 128 real world videos demonstrate the superiority\nof the proposed approach compared to state-of-the-art methods with or without the\nuse of deep learning networks. To the best of our knowledge, this work has high\npotential to be a general solution for moving object segmentation in real world\nenvironments.\n","authors":["Guanfang Dong","Chenqiu Zhao","Xichen Pan","Anup Basu"],"pdf_url":"https://arxiv.org/pdf/2304.09949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03384v2","updated":"2023-08-30T22:20:50Z","published":"2023-04-06T21:29:34Z","title":"Beyond NeRF Underwater: Learning Neural Reflectance Fields for True\n Color Correction of Marine Imagery","summary":" Underwater imagery often exhibits distorted coloration as a result of\nlight-water interactions, which complicates the study of benthic environments\nin marine biology and geography. In this research, we propose an algorithm to\nrestore the true color (albedo) in underwater imagery by jointly learning the\neffects of the medium and neural scene representations. Our approach models\nwater effects as a combination of light attenuation with distance and\nbackscattered light. The proposed neural scene representation is based on a\nneural reflectance field model, which learns albedos, normals, and volume\ndensities of the underwater environment.
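The water model named in the underwater neural-reflectance entry above (attenuation with distance plus backscattered light) matches the classic single-scattering image formation equation, which is easy to sketch. All coefficient values below are made-up illustrative numbers, not the paper's learned parameters.

```python
import numpy as np

def underwater_observation(albedo: np.ndarray,
                           depth: np.ndarray,
                           beta_attn: np.ndarray,
                           beta_back: np.ndarray,
                           backscatter_color: np.ndarray) -> np.ndarray:
    """Attenuated scene radiance plus distance-dependent backscattered light.

    albedo: (H, W, 3) true scene color; depth: (H, W) camera-to-scene distance.
    """
    d = depth[..., None]
    direct = albedo * np.exp(-beta_attn * d)                    # attenuation with distance
    backscatter = backscatter_color * (1 - np.exp(-beta_back * d))
    return direct + backscatter

# Illustrative per-channel coefficients: red attenuates fastest under water.
img = underwater_observation(
    albedo=np.ones((4, 4, 3)) * 0.8,
    depth=np.full((4, 4), 2.0),
    beta_attn=np.array([0.6, 0.2, 0.1]),
    beta_back=np.array([0.5, 0.3, 0.2]),
    backscatter_color=np.array([0.05, 0.25, 0.35]),
)
```

Restoring true color amounts to inverting this forward model, which is why the paper learns the medium parameters jointly with the scene representation.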
We introduce a logistic regression\nmodel to separate water from the scene and apply distinct light physics during\ntraining. Our method avoids the need to estimate complex backscatter effects in\nwater by employing several approximations, enhancing sampling efficiency and\nnumerical stability during training. The proposed technique integrates\nunderwater light effects into a volume rendering framework with end-to-end\ndifferentiability. Experimental results on both synthetic and real-world data\ndemonstrate that our method effectively restores true color from underwater\nimagery, outperforming existing approaches in terms of color consistency.\n","authors":["Tianyi Zhang","Matthew Johnson-Roberson"],"pdf_url":"https://arxiv.org/pdf/2304.03384v2.pdf","comment":"Robotics and Automation Letters (RA-L) VOL. 8, NO. 10, OCTOBER 2023"},{"id":"http://arxiv.org/abs/2308.16325v1","updated":"2023-08-30T21:20:15Z","published":"2023-08-30T21:20:15Z","title":"Two-Stage Violence Detection Using ViTPose and Classification Models at\n Smart Airports","summary":" This study introduces an innovative violence detection framework tailored to\nthe unique requirements of smart airports, where prompt responses to violent\nsituations are crucial. The proposed framework harnesses the power of ViTPose\nfor human pose estimation. It employs a CNN-BiLSTM network to analyse spatial\nand temporal information within keypoint sequences, enabling the accurate\nclassification of violent behaviour in real time. Seamlessly integrated within\nthe SAFE (Situational Awareness for Enhanced Security) framework of SAAB, the\nsolution underwent integrated testing to ensure robust performance in\nreal-world scenarios. The AIRTLab dataset, characterized by its high video quality\nand relevance to surveillance scenarios, is utilized in this study to enhance\nthe model's accuracy and mitigate false positives. As airports face increased\nfoot traffic in the post-pandemic era, implementing AI-driven violence\ndetection systems, such as the one proposed, is paramount for improving\nsecurity, expediting response times, and promoting data-informed\ndecision-making. The implementation of this framework not only diminishes the\nprobability of violent events but also assists surveillance teams in\neffectively addressing potential threats, ultimately fostering a more secure\nand protected aviation sector. Codes are available at:\nhttps://github.com/Asami-1/GDP.\n","authors":["İrem Üstek","Jay Desai","Iván López Torrecillas","Sofiane Abadou","Jinjie Wang","Quentin Fever","Sandhya Rani Kasthuri","Yang Xing","Weisi Guo","Antonios Tsourdos"],"pdf_url":"https://arxiv.org/pdf/2308.16325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01887v3","updated":"2023-08-30T21:17:43Z","published":"2022-10-04T20:14:47Z","title":"Collecting The Puzzle Pieces: Disentangled Self-Driven Human Pose\n Transfer by Permuting Textures","summary":" Human pose transfer synthesizes new view(s) of a person for a given pose.\nRecent work achieves this via self-reconstruction, which disentangles a\nperson's pose and texture information by breaking the person down into parts,\nthen recombines them for reconstruction. However, part-level disentanglement\npreserves some pose information that can create unwanted artifacts. In this\npaper, we propose Pose Transfer by Permuting Textures (PT$^2$), an approach for\nself-driven human pose transfer that disentangles pose from texture at the\npatch-level.
Specifically, we remove pose from an input image by permuting\nimage patches so only texture information remains. Then we reconstruct the\ninput image by sampling from the permuted textures for patch-level\ndisentanglement. To reduce noise and recover clothing shape information from\nthe permuted patches, we employ encoders with multiple kernel sizes in a triple\nbranch network. On DeepFashion and Market-1501, PT$^2$ reports significant\ngains on automatic metrics over other self-driven methods, and even outperforms\nsome fully-supervised methods. A user study also reports images generated by\nour method are preferred in 68% of cases over self-driven approaches from prior\nwork. Code is available at https://github.com/NannanLi999/pt_square.\n","authors":["Nannan Li","Kevin J. Shih","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2210.01887v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16323v1","updated":"2023-08-30T21:06:11Z","published":"2023-08-30T21:06:11Z","title":"Cross-platform software for blood vessel segmentation in\n retinal images","summary":" In this work, we utilize image segmentation to visually identify blood\nvessels in retinal examination images. This process is typically carried out\nmanually. However, we can employ heuristic methods and machine learning to\nautomate or at least expedite the process. In this context, we propose\ncross-platform, open-source, and responsive software that allows users to\nmanually segment a retinal image. The purpose is to use the user-segmented\nimage to retrain machine learning algorithms, thereby enhancing future\nautomated segmentation results. Moreover, the software also incorporates and\napplies certain image filters established in the literature to improve vessel\nvisualization. To our knowledge, this is the first integrated software in the\nliterature that embodies the aforementioned attributes: open-source,\nresponsive, and cross-platform. It offers a\ncomprehensive solution encompassing manual vessel segmentation, as well as the\nautomated execution of classification algorithms to refine predictive models.\n","authors":["João Henrique Pereira Machado","Gilson Adamczuk Oliveira","Érick Oliveira Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2308.16323v1.pdf","comment":"Originally in Portuguese. International Conference on Production\n Research - Americas 2022.\n https://www.even3.com.br/anais/foreigners_subscription_icpr_americas22/664603-software-multiplataforma-para-a-segmentacao-de-vasos-sanguineos-em-imagens-da-retina/"},{"id":"http://arxiv.org/abs/2308.16316v1","updated":"2023-08-30T20:46:45Z","published":"2023-08-30T20:46:45Z","title":"Ten Years of Generative Adversarial Nets (GANs): A survey of the\n state-of-the-art","summary":" Since their inception in 2014, Generative Adversarial Networks (GANs) have\nrapidly emerged as powerful tools for generating realistic and diverse data\nacross various domains, including computer vision and other applied areas.\nConsisting of a discriminative network and a generative network engaged in a\nMinimax game, GANs have revolutionized the field of generative modeling. In\nFebruary 2018, GANs secured the leading spot on the ``Top Ten Global\nBreakthrough Technologies List'' issued by the MIT Technology Review. Over the years, numerous advancements have been proposed,\nleading to a rich array of GAN variants, such as conditional GAN, Wasserstein\nGAN, CycleGAN, and StyleGAN, among many others.
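The patch-permutation idea in the PT$^2$ entry above is easy to illustrate: shuffling non-overlapping patches destroys pose (spatial arrangement) while leaving texture statistics intact. A minimal sketch follows; the 16-pixel patch size is an arbitrary choice, not taken from the paper.

```python
import numpy as np

def permute_patches(image: np.ndarray, patch: int = 16,
                    rng: np.random.Generator | None = None) -> np.ndarray:
    """Shuffle non-overlapping patches so only texture information remains."""
    rng = rng or np.random.default_rng()
    h, w, c = image.shape
    gh, gw = h // patch, w // patch
    # Decompose the image into a (gh, gw) grid of (patch, patch, c) tiles.
    grid = image[:gh * patch, :gw * patch].reshape(gh, patch, gw, patch, c)
    patches = grid.transpose(0, 2, 1, 3, 4).reshape(gh * gw, patch, patch, c)
    patches = patches[rng.permutation(len(patches))]        # destroy spatial order
    out = patches.reshape(gh, gw, patch, patch, c).transpose(0, 2, 1, 3, 4)
    return out.reshape(gh * patch, gw * patch, c)

shuffled = permute_patches(np.zeros((128, 128, 3)))
```

In the paper's pipeline the reconstruction network then has to recover the person purely from these pose-free texture patches, which is what enforces the disentanglement.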
This survey aims to provide a\ngeneral overview of GANs, summarizing the underlying architecture, validation\nmetrics, and application areas of the most widely recognized variants. We also\ndelve into recent theoretical developments, exploring the profound connection\nbetween the adversarial principle underlying GANs and the Jensen-Shannon divergence,\nwhile discussing the optimality characteristics of the GAN framework. The\nefficiency of GAN variants and their model architectures will be evaluated,\nalong with training obstacles and their solutions. In addition, a\ndetailed discussion will be provided, examining the integration of GANs with\nnewly developed deep learning frameworks such as Transformers, Physics-Informed\nNeural Networks, Large Language Models, and Diffusion Models. Finally, we\noutline several open issues and directions for future research in this field.\n","authors":["Tanujit Chakraborty","Ujjwal Reddy K S","Shraddha M. Naik","Madhurima Panja","Bayapureddy Manvitha"],"pdf_url":"https://arxiv.org/pdf/2308.16316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.03022v3","updated":"2023-08-30T20:36:09Z","published":"2023-02-06T18:57:30Z","title":"SurgT challenge: Benchmark of Soft-Tissue Trackers for Robotic Surgery","summary":" This paper introduces the ``SurgT: Surgical Tracking\" challenge which was\norganised in conjunction with MICCAI 2022. There were two purposes for the\ncreation of this challenge: (1) the establishment of the first standardised\nbenchmark for the research community to assess soft-tissue trackers; and (2) to\nencourage the development of unsupervised deep learning methods, given the lack\nof annotated data in surgery. A dataset of 157 stereo endoscopic videos from 20\nclinical cases, along with stereo camera calibration parameters, has been\nprovided. Participants were assigned the task of developing algorithms to track\nthe movement of soft tissues, represented by bounding boxes, in stereo\nendoscopic videos. At the end of the challenge, the developed methods were\nassessed on a previously hidden test subset. This assessment uses benchmarking\nmetrics that were purposely developed for this challenge, to verify the\nefficacy of unsupervised deep learning algorithms in tracking soft tissue. The\nmetric used for ranking the methods was the Expected Average Overlap (EAO)\nscore, which measures the average overlap between a tracker's and the ground\ntruth bounding boxes. Coming first in the challenge was the deep learning\nsubmission by ICVS-2Ai with a superior EAO score of 0.617. This method employs\nARFlow to estimate unsupervised dense optical flow from cropped images, using\nphotometric and regularization losses. Second, Jmees, with an EAO of 0.583, uses\ndeep learning for surgical tool segmentation on top of a non-deep learning\nbaseline method: CSRT. CSRT by itself scores a similar EAO of 0.563. The\nresults from this challenge show that currently, non-deep learning methods are\nstill competitive. The dataset and benchmarking tool created for this challenge\nhave been made publicly available at https://surgt.grand-challenge.org/.\n","authors":["Joao Cartucho","Alistair Weld","Samyakh Tukra","Haozheng Xu","Hiroki Matsuzaki","Taiyo Ishikawa","Minjun Kwon","Yong Eun Jang","Kwang-Ju Kim","Gwang Lee","Bizhe Bai","Lueder Kahrs","Lars Boecking","Simeon Allmendinger","Leopold Muller","Yitong Zhang","Yueming Jin","Sophia Bano","Francisco Vasconcelos","Wolfgang Reiter","Jonas Hajek","Bruno Silva","Estevao Lima","Joao L.
Vilaca","Sandro Queiros","Stamatia Giannarou"],"pdf_url":"https://arxiv.org/pdf/2302.03022v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10474v2","updated":"2023-08-30T20:28:13Z","published":"2023-05-17T17:59:16Z","title":"Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models","summary":" Despite tremendous progress in generating high-quality images using diffusion\nmodels, synthesizing a sequence of animated frames that are both photorealistic\nand temporally coherent is still in its infancy. While off-the-shelf\nbillion-scale datasets for image generation are available, collecting similar\nvideo data of the same scale is still challenging. Also, training a video\ndiffusion model is computationally much more expensive than its image\ncounterpart. In this work, we explore finetuning a pretrained image diffusion\nmodel with video data as a practical solution for the video synthesis task. We\nfind that naively extending the image noise prior to video noise prior in video\ndiffusion leads to sub-optimal performance. Our carefully designed video noise\nprior leads to substantially better performance. Extensive experimental\nvalidation shows that our model, Preserve Your Own Correlation (PYoCo), attains\nSOTA zero-shot text-to-video results on the UCF-101 and MSR-VTT benchmarks. It\nalso achieves SOTA video generation quality on the small-scale UCF-101\nbenchmark with a $10\\times$ smaller model using significantly less computation\nthan the prior art.\n","authors":["Songwei Ge","Seungjun Nah","Guilin Liu","Tyler Poon","Andrew Tao","Bryan Catanzaro","David Jacobs","Jia-Bin Huang","Ming-Yu Liu","Yogesh Balaji"],"pdf_url":"https://arxiv.org/pdf/2305.10474v2.pdf","comment":"ICCV 2023. Project webpage:\n https://research.nvidia.com/labs/dir/pyoco"},{"id":"http://arxiv.org/abs/2303.12743v4","updated":"2023-08-30T20:26:25Z","published":"2023-03-20T07:42:48Z","title":"DR.CPO: Diversified and Realistic 3D Augmentation via Iterative\n Construction, Random Placement, and HPR Occlusion","summary":" In autonomous driving, data augmentation is commonly used for improving 3D\nobject detection. The most basic methods include insertion of copied objects\nand rotation and scaling of the entire training frame. Numerous variants have\nbeen developed as well. The existing methods, however, are considerably limited\nwhen compared to the variety of the real world possibilities. In this work, we\ndevelop a diversified and realistic augmentation method that can flexibly\nconstruct a whole-body object, freely locate and rotate the object, and apply\nself-occlusion and external-occlusion accordingly. To improve the diversity of\nthe whole-body object construction, we develop an iterative method that\nstochastically combines multiple objects observed from the real world into a\nsingle object. Unlike the existing augmentation methods, the constructed\nobjects can be randomly located and rotated in the training frame because\nproper occlusions can be reflected to the whole-body objects in the final step.\nFinally, proper self-occlusion at each local object level and\nexternal-occlusion at the global frame level are applied using the Hidden Point\nRemoval (HPR) algorithm that is computationally efficient. HPR is also used for\nadaptively controlling the point density of each object according to the\nobject's distance from the LiDAR. Experiment results show that the proposed\nDR.CPO algorithm is data-efficient and model-agnostic without incurring any\ncomputational overhead. 
Also, DR.CPO can improve mAP performance by 2.08% when\ncompared to the best 3D detection result known for the KITTI dataset. The code is\navailable at https://github.com/SNU-DRL/DRCPO.git\n","authors":["Jungwook Shin","Jaeill Kim","Kyungeun Lee","Hyunghun Cho","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2303.12743v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06720v2","updated":"2023-08-30T20:12:12Z","published":"2023-04-13T17:59:55Z","title":"Expressive Text-to-Image Generation with Rich Text","summary":" Plain text has become a prevalent interface for text-to-image synthesis.\nHowever, its limited customization options hinder users from accurately\ndescribing desired outputs. For example, plain text makes it hard to specify\ncontinuous quantities, such as the precise RGB color value or importance of\neach word. Furthermore, detailed text prompts for complex scenes are\ntedious for humans to write and challenging for text encoders to interpret. To\naddress these challenges, we propose using a rich-text editor supporting\nformats such as font style, size, color, and footnote. We extract each word's\nattributes from rich text to enable local style control, explicit token\nreweighting, precise color rendering, and detailed region synthesis. We achieve\nthese capabilities through a region-based diffusion process. We first obtain\neach word's region based on attention maps of a diffusion process using plain\ntext. For each region, we enforce its text attributes by creating\nregion-specific detailed prompts and applying region-specific guidance, and\nmaintain its fidelity against plain-text generation through region-based\ninjections. We present various examples of image generation from rich text and\ndemonstrate that our method outperforms strong baselines with quantitative\nevaluations.\n","authors":["Songwei Ge","Taesung Park","Jun-Yan Zhu","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2304.06720v2.pdf","comment":"ICCV 2023. Project webpage: https://rich-text-to-image.github.io/"},{"id":"http://arxiv.org/abs/2304.13455v4","updated":"2023-08-30T19:44:41Z","published":"2023-04-26T11:27:34Z","title":"From Chaos Comes Order: Ordering Event Representations for Object\n Recognition and Detection","summary":" Today, state-of-the-art deep neural networks that process events first\nconvert them into dense, grid-like input representations before using an\noff-the-shelf network. However, selecting the appropriate representation for\nthe task traditionally requires training a neural network for each\nrepresentation and selecting the best one based on the validation score, which\nis very time-consuming. This work eliminates this bottleneck by selecting\nrepresentations based on the Gromov-Wasserstein Discrepancy (GWD) between raw\nevents and their representation. It is about 200 times faster to compute than\ntraining a neural network and preserves the task performance ranking of event\nrepresentations across multiple representations, network backbones, datasets,\nand tasks. Thus, finding representations with high task scores is equivalent to\nfinding representations with a low GWD. We use this insight to, for the first\ntime, perform a hyperparameter search on a large family of event\nrepresentations, revealing new and powerful representations that exceed the\nstate-of-the-art.
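Ranking candidate event representations by GWD, as in the entry above, can be prototyped with the POT library. This is a hedged sketch: it assumes POT's `ot.gromov.gromov_wasserstein2` is available with this calling convention, and the point sets and candidate names below are invented for illustration, not the paper's actual featurization.

```python
import numpy as np
import ot  # Python Optimal Transport (POT), assumed installed

def gwd_score(raw_events: np.ndarray, represented: np.ndarray) -> float:
    """Gromov-Wasserstein discrepancy between raw events and a representation.

    GW compares *intra*-domain geometry, so the two point sets may live in
    spaces of different dimensionality.
    """
    C1 = ot.dist(raw_events, raw_events)        # pairwise distances, raw domain
    C2 = ot.dist(represented, represented)      # pairwise distances, representation
    p = np.full(len(raw_events), 1.0 / len(raw_events))
    q = np.full(len(represented), 1.0 / len(represented))
    return float(ot.gromov.gromov_wasserstein2(C1, C2, p, q, loss_fun="square_loss"))

# Rank candidates: lower GWD should track higher downstream task score.
rng = np.random.default_rng(0)
raw = rng.normal(size=(100, 4))                 # e.g., (x, y, t, polarity) events
candidates = {"voxel_grid": rng.normal(size=(100, 32)),
              "time_surface": rng.normal(size=(100, 32))}
ranking = sorted(candidates, key=lambda k: gwd_score(raw, candidates[k]))
```

The appeal is exactly what the abstract claims: this score needs no network training, so a large family of representations can be searched cheaply.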
Our optimized representations outperform existing\nrepresentations by 1.7 mAP on the 1 Mpx dataset and 0.3 mAP on the Gen1\ndataset, two established object detection benchmarks, and reach a 3.8% higher\nclassification score on the mini N-ImageNet benchmark. Moreover, we outperform the\nstate of the art by 2.1 mAP on Gen1 and state-of-the-art feed-forward methods\nby 6.0 mAP on the 1 Mpx dataset. This work opens a new unexplored field of\nexplicit representation optimization for event-based learning.\n","authors":["Nikola Zubić","Daniel Gehrig","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2304.13455v4.pdf","comment":"15 pages, 11 figures, 2 tables, ICCV 2023 Camera Ready paper"},{"id":"http://arxiv.org/abs/2211.00646v3","updated":"2023-08-30T19:25:14Z","published":"2022-11-01T00:40:09Z","title":"Learning Melanocytic Cell Masks from Adjacent Stained Tissue","summary":" Melanoma is one of the most aggressive forms of skin cancer, causing a large\nproportion of skin cancer deaths. However, melanoma diagnoses by pathologists\nshow low inter-rater reliability. As melanoma is a cancer of the melanocyte,\nthere is a clear need to develop a melanocytic cell segmentation tool that is\nagnostic to pathologist variability and automates pixel-level annotation.\nGigapixel-level pathologist labeling, however, is impractical. Herein, we\npropose a means to train deep neural networks for melanocytic cell segmentation\nfrom hematoxylin and eosin (H&E) stained sections and paired\nimmunohistochemistry (IHC) of adjacent tissue sections, achieving a mean IOU of\n0.64 despite imperfect ground-truth labels.\n","authors":["Mikio Tada","Ursula E. Lang","Iwei Yeh","Maria L. Wei","Michael J. Keiser"],"pdf_url":"https://arxiv.org/pdf/2211.00646v3.pdf","comment":"Accepted at Medical Image Learning with Limited & Noisy Data\n Workshop, Medical Image Computing and Computer Assisted Interventions\n (MICCAI) 2022"},{"id":"http://arxiv.org/abs/2308.16280v1","updated":"2023-08-30T19:13:23Z","published":"2023-08-30T19:13:23Z","title":"A reinforcement learning based construction material supply strategy\n using robotic crane and computer vision for building reconstruction after an\n earthquake","summary":" After an earthquake, it is particularly important to provide the necessary\nresources on site because a large amount of infrastructure needs to be repaired\nor newly constructed. Due to the complex construction environment after the\ndisaster, there are potential safety hazards for human laborers working in this\nenvironment. With the advancement of robotic technology and artificial\nintelligence (AI) algorithms, smart robotic technology is a potential solution\nfor providing construction resources after an earthquake. In this paper, a\nrobotic crane with advanced AI algorithms is proposed to provide resources for\ninfrastructure reconstruction after an earthquake. Proximal policy\noptimization (PPO), a reinforcement learning (RL) algorithm, is implemented for\n3D lift path planning when transporting the construction materials. The state\nand reward function are designed in detail for RL model training. Two models\nare trained through a loading task in different environments by using the PPO\nalgorithm, one considering the influence of obstacles and the other not\nconsidering obstacles. Then, the two trained models are compared and evaluated\nthrough an unloading task and a loading task in simulation environments. For\neach task, two different cases are considered.
In one, there is no obstacle\nbetween the initial position, where the construction material is lifted, and the\ntarget position; in the other, there are obstacles between the two. The results show that the model that\nconsiders obstacles during training can generate proper actions for the\nrobotic crane to execute, so that the crane can automatically transport the\nconstruction materials to the desired location with swing suppression, low\ntime consumption, and collision avoidance.\n","authors":["Yifei Xiao","T. Y. Yang","Xiao Pan","Fan Xie","Zhongwei Chen"],"pdf_url":"https://arxiv.org/pdf/2308.16280v1.pdf","comment":"12 pages, 7 figures, accepted in the Canadian Conference - Pacific\n Conference on Earthquake Engineering 2023, Vancouver, British Columbia"},{"id":"http://arxiv.org/abs/2308.16278v1","updated":"2023-08-30T19:09:56Z","published":"2023-08-30T19:09:56Z","title":"Autonomous damage assessment of structural columns using low-cost micro\n aerial vehicles and multi-view computer vision","summary":" Structural columns are the crucial load-carrying components of buildings and\nbridges. Early detection of column damage is important for the assessment of\nthe residual performance and the prevention of system-level collapse. This\nresearch proposes an innovative end-to-end micro aerial vehicle\n(MAV)-based approach to automatically scan and inspect columns. First, an MAV-based\nautomatic image collection method is proposed. The MAV is programmed to sense\nthe structural columns and their surrounding environment. During\nnavigation, the MAV first detects and approaches the structural columns. Then,\nit starts to collect image data at multiple viewpoints around every detected\ncolumn. Second, the collected images are used to assess the damage types\nand damage locations. Third, the damage state of the structural column is\ndetermined by fusing the evaluation outcomes from multiple camera views. In\nthis study, reinforced concrete (RC) columns are selected to demonstrate the\neffectiveness of the approach. Experimental results indicate that the proposed\nMAV-based inspection approach can effectively collect images from multiple\nviewing angles, and accurately assess critical RC column damages. The approach\nimproves the level of autonomy during the inspection. In addition, the\nevaluation outcomes are more comprehensive than those of existing 2D vision methods.\nThe concept of the proposed inspection approach can be extended to other\nstructural columns such as bridge piers.\n","authors":["Sina Tavasoli","Xiao Pan","T. Y. Yang","Saudah Gazi","Mohsen Azimi"],"pdf_url":"https://arxiv.org/pdf/2308.16278v1.pdf","comment":"12 pages, 11 figures, accepted in the Canadian Conference - Pacific\n Conference on Earthquake Engineering 2023, Vancouver, British Columbia"},{"id":"http://arxiv.org/abs/2308.16274v1","updated":"2023-08-30T19:04:34Z","published":"2023-08-30T19:04:34Z","title":"Learning Diverse Features in Vision Transformers for Improved\n Generalization","summary":" Deep learning models often rely only on a small set of features even when\nthere is a rich set of predictive signals in the training data. This makes\nmodels brittle and sensitive to distribution shifts. In this work, we first\nexamine vision transformers (ViTs) and find that they tend to extract robust\nand spurious features with distinct attention heads.
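The crane-planning entry above says the state and reward are designed in detail but does not give them. Purely as an illustration of the kind of shaping such a lift-path planner might use, here is a toy reward combining progress toward the target, an obstacle-clearance penalty, and a payload-swing penalty; every term and weight is an assumption, not taken from the paper.

```python
import numpy as np

def lift_reward(payload_pos: np.ndarray, target_pos: np.ndarray,
                obstacle_centers: np.ndarray, swing_angle: float,
                clearance: float = 1.0) -> float:
    """Toy shaped reward for 3D lift path planning (illustrative only)."""
    dist = float(np.linalg.norm(payload_pos - target_pos))
    reward = -0.1 * dist                               # progress toward the target
    if len(obstacle_centers):
        nearest = float(np.min(np.linalg.norm(obstacle_centers - payload_pos, axis=1)))
        if nearest < clearance:                        # penalize near-collisions
            reward -= 10.0 * (clearance - nearest)
    reward -= 0.5 * abs(swing_angle)                   # suppress payload swing
    if dist < 0.1:                                     # bonus for reaching the target
        reward += 100.0
    return reward
```

A reward of this shape would explain the reported behavior: the obstacle term only matters if obstacles appear during training, which is why the obstacle-aware model generalizes to cluttered lift paths.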
As a result of this\nmodularity, their performance under distribution shifts can be significantly\nimproved at test time by pruning heads corresponding to spurious features,\nwhich we demonstrate using an \"oracle selection\" on validation data. Second, we\npropose a method to further enhance the diversity and complementarity of the\nlearned features by encouraging orthogonality of the attention heads' input\ngradients. We observe improved out-of-distribution performance on diagnostic\nbenchmarks (MNIST-CIFAR, Waterbirds) as a consequence of the enhanced diversity\nof features and the pruning of undesirable heads.\n","authors":["Armand Mihai Nicolicioiu","Andrei Liviu Nicolicioiu","Bogdan Alexe","Damien Teney"],"pdf_url":"https://arxiv.org/pdf/2308.16274v1.pdf","comment":"2023 ICML Workshop on Spurious Correlations, Invariance and Stability"},{"id":"http://arxiv.org/abs/2308.16271v1","updated":"2023-08-30T19:02:17Z","published":"2023-08-30T19:02:17Z","title":"Emergence of Segmentation with Minimalistic White-Box Transformers","summary":" Transformer-like models for vision tasks have recently proven effective for a\nwide range of downstream applications such as segmentation and detection.\nPrevious works have shown that segmentation properties emerge in vision\ntransformers (ViTs) trained using self-supervised methods such as DINO, but not\nin those trained on supervised classification tasks. In this study, we probe\nwhether segmentation emerges in transformer-based models solely as a result of\nintricate self-supervised learning mechanisms, or if the same emergence can be\nachieved under much broader conditions through proper design of the model\narchitecture. Through extensive experimental results, we demonstrate that when\nemploying a white-box transformer-like architecture known as CRATE, whose\ndesign explicitly models and pursues low-dimensional structures in the data\ndistribution, segmentation properties, at both the whole and parts levels,\nalready emerge with a minimalistic supervised training recipe. Layer-wise\nfiner-grained analysis reveals that the emergent properties strongly\ncorroborate the designed mathematical functions of the white-box network. Our\nresults suggest a path to design white-box foundation models that are\nsimultaneously highly performant and mathematically fully interpretable. Code\nis at \\url{https://github.com/Ma-Lab-Berkeley/CRATE}.\n","authors":["Yaodong Yu","Tianzhe Chu","Shengbang Tong","Ziyang Wu","Druv Pai","Sam Buchanan","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2308.16271v1.pdf","comment":"Code: https://github.com/Ma-Lab-Berkeley/CRATE"},{"id":"http://arxiv.org/abs/2308.16269v1","updated":"2023-08-30T18:58:32Z","published":"2023-08-30T18:58:32Z","title":"Can Prompt Learning Benefit Radiology Report Generation?","summary":" Radiology report generation aims to automatically provide clinically\nmeaningful descriptions of radiology images such as MRI and X-ray. Although\ngreat success has been achieved in natural scene image captioning tasks,\nradiology report generation remains challenging and requires prior medical\nknowledge. In this paper, we propose PromptRRG, a method that utilizes prompt\nlearning to activate a pretrained model and incorporate prior knowledge. 
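The diversity regularizer described in the vision-transformer entry above (encouraging orthogonality of the attention heads' input gradients) can be sketched as a pairwise cosine-similarity penalty. A minimal PyTorch version follows, under the assumption that one scalar per head (e.g., a pooled head activation) is available as a function of the input; how those scalars are extracted in the actual method is not specified by the abstract.

```python
import torch

def head_gradient_orthogonality_penalty(head_outputs: list,
                                        inputs: torch.Tensor) -> torch.Tensor:
    """Penalize alignment between per-head input gradients.

    head_outputs: one scalar tensor per attention head, each computed from
    `inputs` (which must have requires_grad=True).
    """
    grads = []
    for h in head_outputs:
        g, = torch.autograd.grad(h, inputs, retain_graph=True, create_graph=True)
        grads.append(g.flatten())
    G = torch.stack(grads)                             # (num_heads, input_dim)
    G = torch.nn.functional.normalize(G, dim=1)
    sim = G @ G.T                                      # pairwise cosine similarities
    off_diag = sim - torch.eye(len(grads), device=sim.device)
    return (off_diag ** 2).sum()                       # zero iff gradients orthogonal

# Typical use: loss = task_loss + lambda_div * head_gradient_orthogonality_penalty(...)
```

Driving the off-diagonal similarities to zero pushes heads to respond to different directions in input space, which is the complementarity the entry argues improves out-of-distribution performance.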
Since\nprompt learning for radiology report generation has not been explored before,\nwe begin by investigating prompt designs and categorise them based on varying\nlevels of knowledge: common, domain-specific and disease-enriched prompts.\nAdditionally, we propose an automatic prompt learning mechanism to alleviate\nthe burden of manual prompt engineering. This is the first work to\nsystematically examine the effectiveness of prompt learning for radiology\nreport generation. Experimental results on the largest radiology report\ngeneration benchmark, MIMIC-CXR, demonstrate that our proposed method achieves\nstate-of-the-art performance. Code will be available upon acceptance.\n","authors":["Jun Wang","Lixing Zhu","Abhir Bhalerao","Yulan He"],"pdf_url":"https://arxiv.org/pdf/2308.16269v1.pdf","comment":"8 pages with 6 pages supplementary file"},{"id":"http://arxiv.org/abs/2202.04053v3","updated":"2023-08-30T18:41:01Z","published":"2022-02-08T18:36:52Z","title":"DALL-Eval: Probing the Reasoning Skills and Social Biases of\n Text-to-Image Generation Models","summary":" Recently, DALL-E, a multimodal transformer language model, and its variants,\nincluding diffusion models, have shown high-quality text-to-image generation\ncapabilities. However, despite the realistic image generation results, there\nhas not been a detailed analysis of how to evaluate such models. In this work,\nwe investigate the visual reasoning capabilities and social biases of different\ntext-to-image models, covering both multimodal transformer language models and\ndiffusion models. First, we measure three visual reasoning skills: object\nrecognition, object counting, and spatial relation understanding. For this, we\npropose PaintSkills, a compositional diagnostic evaluation dataset that\nmeasures these skills. Despite the high-fidelity image generation capability, a\nlarge gap exists between the performance of recent models and the upper bound\naccuracy in object counting and spatial relation understanding skills. Second,\nwe assess the gender and skin tone biases by measuring the gender/skin tone\ndistribution of generated images across various professions and attributes. We\ndemonstrate that recent text-to-image generation models learn specific biases\nabout gender and skin tone from web image-text pairs. We hope our work will\nhelp guide future progress in improving text-to-image generation models on\nvisual reasoning skills and learning socially unbiased representations. Code\nand data: https://github.com/j-min/DallEval\n","authors":["Jaemin Cho","Abhay Zala","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2202.04053v3.pdf","comment":"ICCV 2023 (34 pages; see appendix for version changelog)"},{"id":"http://arxiv.org/abs/2211.11827v2","updated":"2023-08-30T18:39:25Z","published":"2022-11-21T19:47:59Z","title":"High-Perceptual Quality JPEG Decoding via Posterior Sampling","summary":" JPEG is arguably the most popular image coding format, achieving high\ncompression ratios via lossy quantization that may create visual artifacts and\ndegrade image quality. Numerous attempts to remove these artifacts were conceived over\nthe years, and common to most of these is the use of deterministic\npost-processing algorithms that optimize some distortion measure (e.g., PSNR,\nSSIM).
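The three knowledge levels of prompts in the PromptRRG entry above can be pictured as increasingly specific prefixes handed to the report generator. The prompt strings below are invented examples of each category, not the paper's actual prompts, and `build_input` is a hypothetical helper.

```python
# Illustrative prompt templates for the three knowledge levels (invented examples).
PROMPTS = {
    "common": "Describe the findings in this image.",
    "domain_specific": "Describe the findings in this chest X-ray, "
                       "covering the lungs, heart, and pleura.",
    "disease_enriched": "Describe the findings in this chest X-ray, noting any "
                        "signs of pneumonia, effusion, cardiomegaly, or atelectasis.",
}

def build_input(prompt_level: str, image_tokens: str) -> str:
    """Prepend the chosen prompt to the encoded image representation (sketch)."""
    return f"{PROMPTS[prompt_level]} [IMG] {image_tokens}"
```

The paper's automatic prompt learning mechanism would replace these hand-written strings with learned ones, removing the manual engineering step.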
In this paper we propose a different paradigm for JPEG artifact\ncorrection: Our method is stochastic, and the objective we target is high\nperceptual quality -- striving to obtain sharp, detailed and visually pleasing\nreconstructed images, while being consistent with the compressed input. These\ngoals are achieved by training a stochastic conditional generator (conditioned\non the compressed input), accompanied by a theoretically well-founded loss\nterm, resulting in a sampler from the posterior distribution. Our solution\noffers a diverse set of plausible and fast reconstructions for a given input\nwith perfect consistency. We demonstrate our scheme's unique properties and its\nsuperiority to a variety of alternative methods on the FFHQ and ImageNet\ndatasets.\n","authors":["Sean Man","Guy Ohayon","Theo Adrai","Michael Elad"],"pdf_url":"https://arxiv.org/pdf/2211.11827v2.pdf","comment":"Presented in NTIRE workshop as part of CVPR 2023"},{"id":"http://arxiv.org/abs/2308.16258v1","updated":"2023-08-30T18:31:51Z","published":"2023-08-30T18:31:51Z","title":"Robust Principles: Architectural Design Principles for Adversarially\n Robust CNNs","summary":" Our research aims to unify existing works' diverging opinions on how\narchitectural components affect the adversarial robustness of CNNs. To\naccomplish our goal, we synthesize a suite of three generalizable robust\narchitectural design principles: (a) optimal range for depth and width\nconfigurations, (b) preferring convolutional over patchify stem stage, and (c)\nrobust residual block design through adopting squeeze and excitation blocks and\nnon-parametric smooth activation functions. Through extensive experiments\nacross a wide spectrum of dataset scales, adversarial training methods, model\nparameters, and network design spaces, our principles consistently and markedly\nimprove AutoAttack accuracy: 1-3 percentage points (pp) on CIFAR-10 and\nCIFAR-100, and 4-9 pp on ImageNet. The code is publicly available at\nhttps://github.com/poloclub/robust-principles.\n","authors":["ShengYun Peng","Weilin Xu","Cory Cornelius","Matthew Hull","Kevin Li","Rahul Duggal","Mansi Phute","Jason Martin","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2308.16258v1.pdf","comment":"Published at BMVC'23"},{"id":"http://arxiv.org/abs/2308.16246v1","updated":"2023-08-30T18:07:30Z","published":"2023-08-30T18:07:30Z","title":"Active Neural Mapping","summary":" We address the problem of active mapping with a continually-learned neural\nscene representation, namely Active Neural Mapping. The key lies in actively\nfinding the target space to be explored with efficient agent movement, thus\nminimizing the map uncertainty on-the-fly within a previously unseen\nenvironment. In this paper, we examine the weight space of the\ncontinually-learned neural field, and show empirically that the neural\nvariability, the prediction robustness against random weight perturbation, can\nbe directly utilized to measure the instant uncertainty of the neural map.\nTogether with the continuous geometric information inherited in the neural map,\nthe agent can be guided to find a traversable path to gradually gain knowledge\nof the environment. We present for the first time an active mapping system with\na coordinate-based implicit neural representation for online scene\nreconstruction. 
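The uncertainty measure in the Active Neural Mapping entry above (prediction robustness against random weight perturbation) is simple to prototype: perturb the network weights a few times and take the variance of the resulting predictions. A PyTorch sketch follows; the perturbation scale and sample count are arbitrary choices, not values from the paper.

```python
import copy
import torch

@torch.no_grad()
def neural_variability(model: torch.nn.Module, coords: torch.Tensor,
                       sigma: float = 0.01, n_samples: int = 8) -> torch.Tensor:
    """Per-query uncertainty as prediction variance under weight perturbation."""
    preds = []
    for _ in range(n_samples):
        noisy = copy.deepcopy(model)
        for p in noisy.parameters():                  # random weight perturbation
            p.add_(sigma * torch.randn_like(p))
        preds.append(noisy(coords))
    return torch.stack(preds).var(dim=0)              # high variance = uncertain region
```

Regions of the neural map where this variance is high are the ones the agent would be steered toward to reduce map uncertainty.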
Experiments in the visually-realistic Gibson and Matterport3D\nenvironments demonstrate the efficacy of the proposed method.\n","authors":["Zike Yan","Haoxiang Yang","Hongbin Zha"],"pdf_url":"https://arxiv.org/pdf/2308.16246v1.pdf","comment":"ICCV 2023, project page:\n https://zikeyan.github.io/active-INR/index.html"},{"id":"http://arxiv.org/abs/2308.16215v1","updated":"2023-08-30T16:44:38Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control","summary":" Lossy video compression is commonly used when transmitting and storing video\ndata. Unified video codecs (e.g., H.264 or H.265) remain the \\emph{de facto}\nstandard, despite the availability of advanced (neural) compression approaches.\nTransmitting videos in the face of dynamic network bandwidth conditions\nrequires video codecs to adapt to vastly different compression strengths. Rate\ncontrol modules augment the codec's compression such that bandwidth constraints\nare satisfied and video distortion is minimized. While both standard video\ncodecs and their rate control modules are developed to minimize video distortion\nw.r.t. human quality assessment, preserving the downstream performance of deep\nvision models is not considered. In this paper, we present the first end-to-end\nlearnable deep video codec control considering both bandwidth constraints and\ndownstream vision performance, while not breaking existing standardization. We\ndemonstrate for two common vision tasks (semantic segmentation and optical flow\nestimation) and on two different datasets that our deep codec control better\npreserves downstream performance than using 2-pass average bit rate control\nwhile meeting dynamic bandwidth constraints and adhering to standardizations.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v1.pdf","comment":"22 pages, 26 figures, 6 tables"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.12767v2","updated":"2023-08-30T15:52:34Z","published":"2023-08-24T13:14:49Z","title":"On the Consistency of Average Embeddings for Item Recommendation","summary":" A prevalent practice in recommender systems consists in averaging item\nembeddings to represent users or higher-level concepts in the same embedding\nspace. This paper investigates the relevance of such a practice. For this\npurpose, we propose an expected precision score, designed to measure the\nconsistency of an average embedding relative to the items used for its\nconstruction. We subsequently analyze the mathematical expression of this score\nin a theoretical setting with specific assumptions, as well as its empirical\nbehavior on real-world data from music streaming services. 
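The expected precision score in the average-embeddings abstract above is defined formally in the paper; a simplified retrieval-style proxy conveys the intuition of measuring how consistent a mean embedding is with the items it was built from. The random catalog, cosine scoring, and top-k cutoff below are illustrative assumptions, not the paper's exact score.

```python
# Simplified proxy (not the paper's exact expected precision score) for
# the consistency of an average embedding: embed a user as the mean of
# their item embeddings, then check what fraction of those items appear
# among the user's top-k nearest items in the catalog.
import numpy as np

rng = np.random.default_rng(0)
catalog = rng.normal(size=(1000, 32))
catalog /= np.linalg.norm(catalog, axis=1, keepdims=True)  # unit-norm items

user_items = rng.choice(1000, size=10, replace=False)
user_vec = catalog[user_items].mean(axis=0)

scores = catalog @ user_vec            # orders items like cosine similarity
top_k = np.argsort(-scores)[:10]
precision = len(set(top_k) & set(user_items)) / 10
print(f"consistency proxy: {precision:.2f}")
```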
Our results\nemphasize that real-world averages are less consistent for recommendation,\nwhich paves the way for future research to better align real-world embeddings\nwith assumptions from our theoretical setting.\n","authors":["Walid Bendada","Guillaume Salha-Galvan","Romain Hennequin","Thomas Bouabça","Tristan Cazenave"],"pdf_url":"https://arxiv.org/pdf/2308.12767v2.pdf","comment":"17th ACM Conference on Recommender Systems (RecSys 2023)"},{"id":"http://arxiv.org/abs/2305.17926v2","updated":"2023-08-30T13:22:35Z","published":"2023-05-29T07:41:03Z","title":"Large Language Models are not Fair Evaluators","summary":" In this paper, we uncover a systematic bias in the evaluation paradigm of\nadopting large language models~(LLMs), e.g., GPT-4, as a referee to score and\ncompare the quality of responses generated by candidate models. We find that\nthe quality ranking of candidate responses can be easily hacked by simply\naltering their order of appearance in the context. This manipulation allows us\nto skew the evaluation result, making one model appear considerably superior to\nthe other, e.g., Vicuna-13B could beat ChatGPT on 66 over 80 tested queries\nwith ChatGPT as an evaluator. To address this issue, we propose a calibration\nframework with three simple yet effective strategies: 1) Multiple Evidence\nCalibration, which requires the evaluator model to generate multiple evaluation\nevidence before assigning ratings; 2) Balanced Position Calibration, which\naggregates results across various orders to determine the final score; 3)\nHuman-in-the-Loop Calibration, which introduces a balanced position diversity\nentropy to measure the difficulty of each example and seeks human assistance\nwhen needed. We also manually annotate the \"win/tie/lose\" outcomes of responses\nfrom ChatGPT and Vicuna-13B in the Vicuna Benchmark's question prompt, and\nextensive experiments demonstrate that our approach successfully mitigates\nevaluation bias, resulting in closer alignment with human judgments. We release\nour code and human annotation at \\url{https://github.com/i-Eval/FairEval} to\nfacilitate future research.\n","authors":["Peiyi Wang","Lei Li","Liang Chen","Zefan Cai","Dawei Zhu","Binghuai Lin","Yunbo Cao","Qi Liu","Tianyu Liu","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2305.17926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15980v1","updated":"2023-08-30T12:09:18Z","published":"2023-08-30T12:09:18Z","title":"Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems","summary":" In sequential recommendation, multi-modal information (e.g., text or image)\ncan provide a more comprehensive view of an item's profile. The optimal stage\n(early or late) to fuse modality features into item representations is still\ndebated. We propose a graph-based approach (named MMSR) to fuse modality\nfeatures in an adaptive order, enabling each modality to prioritize either its\ninherent sequential nature or its interplay with other modalities. MMSR\nrepresents each user's history as a graph, where the modality features of each\nitem in a user's history sequence are denoted by cross-linked nodes. The edges\nbetween homogeneous nodes represent intra-modality sequential relationships,\nand the ones between heterogeneous nodes represent inter-modality\ninterdependence relationships. During graph propagation, MMSR incorporates dual\nattention, differentiating homogeneous and heterogeneous neighbors. 
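The dual-attention propagation step that the MMSR abstract above describes, treating homogeneous (same-modality) and heterogeneous (cross-modality) neighbors separately, can be sketched as follows. The bilinear scoring, residual update, and tensor shapes are simplifying assumptions, not the paper's implementation.

```python
# Minimal sketch of the "dual attention" idea in the MMSR abstract
# above: a node attends separately to homogeneous and heterogeneous
# neighbors, each with its own parameters, before a residual update.
import torch
import torch.nn.functional as F

def dual_attention_step(h, homo_nbrs, hetero_nbrs, W_homo, W_hetero):
    """h: (d,) node state; *_nbrs: (n, d) neighbor states."""
    def attend(nbrs, W):
        if nbrs.numel() == 0:
            return torch.zeros_like(h)
        scores = nbrs @ (W @ h)          # bilinear attention scores
        alpha = F.softmax(scores, dim=0)
        return alpha @ nbrs              # weighted neighbor summary
    msg = attend(homo_nbrs, W_homo) + attend(hetero_nbrs, W_hetero)
    return F.normalize(h + msg, dim=0)   # residual update

d = 16
h = torch.randn(d)
print(dual_attention_step(h, torch.randn(3, d), torch.randn(2, d),
                          torch.randn(d, d), torch.randn(d, d)).shape)
```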
To\nadaptively assign nodes with distinct fusion orders, MMSR allows each node's\nrepresentation to be asynchronously updated through an update gate. In\nscenarios where modalities exhibit stronger sequential relationships, the\nupdate gate prioritizes updates among homogeneous nodes. Conversely, when the\ninterdependent relationships between modalities are more pronounced, the update\ngate prioritizes updates among heterogeneous nodes. Consequently, MMSR\nestablishes a fusion order that spans a spectrum from early to late modality\nfusion. In experiments across six datasets, MMSR consistently outperforms\nstate-of-the-art models, and our graph propagation methods surpass other graph\nneural networks. Additionally, MMSR naturally manages missing modalities.\n","authors":["Hengchang Hu","Wei Guo","Yong Liu","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2308.15980v1.pdf","comment":"CIKM'2023"},{"id":"http://arxiv.org/abs/2308.15968v1","updated":"2023-08-30T11:45:35Z","published":"2023-08-30T11:45:35Z","title":"Denoising Attention for Query-aware User Modeling in Personalized Search","summary":" The personalization of search results has gained increasing attention in the\npast few years, thanks to the development of Neural Networks-based approaches\nfor Information Retrieval and the importance of personalization in many search\nscenarios. Recent works have proposed to build user models at query time by\nleveraging the Attention mechanism, which allows weighing the contribution of\nthe user-related information w.r.t. the current query. This approach allows\ntaking into account the diversity of the user's interests by giving more\nimportance to those related to the current search performed by the user.\n In this paper, we first discuss some shortcomings of the standard Attention\nformulation when employed for personalization. In particular, we focus on\nissues related to its normalization mechanism and its inability to entirely\nfilter out noisy user-related information. Then, we introduce the Denoising\nAttention mechanism: an Attention variant that directly tackles the above\nshortcomings by adopting a robust normalization scheme and introducing a\nfiltering mechanism. The reported experimental evaluation shows the benefits of\nthe proposed approach over other Attention-based variants.\n","authors":["Elias Bassani","Pranav Kasela","Gabriella Pasi"],"pdf_url":"https://arxiv.org/pdf/2308.15968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15823v1","updated":"2023-08-30T07:53:27Z","published":"2023-08-30T07:53:27Z","title":"DRGame: Diversified Recommendation for Multi-category Video Games with\n Balanced Implicit Preferences","summary":" The growing popularity of subscription services in video game consumption has\nemphasized the importance of offering diversified recommendations. Providing\nusers with a diverse range of games is essential for ensuring continued\nengagement and fostering long-term subscriptions. However, existing\nrecommendation models face challenges in effectively handling highly imbalanced\nimplicit feedback in gaming interactions. Additionally, they struggle to take\ninto account the distinctive characteristics of multiple categories and the\nlatent user interests associated with these categories. In response to these\nchallenges, we propose a novel framework, named DRGame, to obtain diversified\nrecommendation. 
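For context on the Denoising Attention abstract above, this is the standard query-aware attention user model it critiques, with a comment marking the normalization step the authors identify as problematic: softmax weights are strictly positive and sum to one, so noisy history items can never be weighted fully to zero. The paper's own denoising variant is not reproduced here.

```python
# Standard query-aware user modeling via attention, the baseline the
# Denoising Attention abstract above starts from. The softmax line is
# the normalization that cannot entirely filter out noisy user-related
# vectors (every history item keeps weight > 0).
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def user_model(query, history):
    """query: (d,), history: (n, d) -> query-conditioned user vector."""
    scores = history @ query
    weights = softmax(scores)   # <- weights are positive and sum to 1
    return weights @ history

rng = np.random.default_rng(1)
q, H = rng.normal(size=8), rng.normal(size=(5, 8))
print(user_model(q, H).shape)   # (8,)
```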
It is centered on multi-category video games, consisting of two\ncomponents: Balance-driven Implicit Preferences Learning for data\npre-processing and Clustering-based Diversified Recommendation Module for\nfinal prediction. The first module aims to achieve a balanced representation of\nimplicit feedback in game time, thereby discovering a comprehensive view of\nplayer interests across different categories. The second module adopts\ncategory-aware representation learning to cluster and select players and games\nbased on balanced implicit preferences, and then employs asymmetric neighbor\naggregation to achieve diversified recommendations. Experimental results on a\nreal-world dataset demonstrate the superiority of our proposed method over\nexisting approaches in terms of game diversity recommendations.\n","authors":["Kangzhe Liu","Jianghong Ma","Shanshan Feng","Haijun Zhang","Zhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15813v1","updated":"2023-08-30T07:36:12Z","published":"2023-08-30T07:36:12Z","title":"Knowledge-grounded Natural Language Recommendation Explanation","summary":" Explanations accompanied by a recommendation can assist users in\nunderstanding the decision made by recommendation systems, which in turn\nincreases a user's confidence and trust in the system. Recently, research has\nfocused on generating natural language explanations in a human-readable format.\nThus far, the proposed approaches leverage item reviews written by users, which\nare often subjective, sparse in language, and unable to account for new items\nthat have not been purchased or reviewed before. Instead, we aim to generate\nfact-grounded recommendation explanations that are objectively described with\nitem features while implicitly considering a user's preferences, based on the\nuser's purchase history. To achieve this, we propose a knowledge graph (KG)\napproach to natural language explainable recommendation. Our approach draws on\nuser-item features through a novel collaborative filtering-based KG\nrepresentation to produce fact-grounded, personalized explanations, while\njointly learning user-item representations for recommendation scoring.\nExperimental results show that our approach consistently outperforms previous\nstate-of-the-art models on natural language explainable recommendation.\n","authors":["Anthony Colas","Jun Araki","Zhengyu Zhou","Bingqing Wang","Zhe Feng"],"pdf_url":"https://arxiv.org/pdf/2308.15813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06566v3","updated":"2023-08-30T06:46:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items. Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. 
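The open-source branch of the ONCE abstract above, using a pretrained LLM's deep layers as a content encoder, can be sketched with Hugging Face transformers. "gpt2" below is a small stand-in model chosen so the snippet runs quickly, not the paper's choice, and mean pooling of the hidden states is one simple pooling assumption.

```python
# Sketch of using an open-source LLM's deep layers as a content
# encoder, per the ONCE abstract above: pool the final hidden states
# into an item embedding. "gpt2" is an illustrative stand-in model.
import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
enc = AutoModel.from_pretrained("gpt2")

def item_embedding(text: str) -> torch.Tensor:
    inputs = tok(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden = enc(**inputs).last_hidden_state  # (1, seq_len, dim)
    return hidden.mean(dim=1).squeeze(0)          # mean-pooled content vector

print(item_embedding("A news article about renewable energy.").shape)
```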
For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10028v2","updated":"2023-08-30T06:33:32Z","published":"2023-08-19T14:25:59Z","title":"Voucher Abuse Detection with Prompt-based Fine-tuning on Graph Neural\n Networks","summary":" Voucher abuse detection is an important anomaly detection problem in\nE-commerce. While many GNN-based solutions have emerged, the supervised\nparadigm depends on a large quantity of labeled data. A popular alternative is\nto adopt self-supervised pre-training using label-free data, and further\nfine-tune on a downstream task with limited labels. Nevertheless, the\n\"pre-train, fine-tune\" paradigm is often plagued by the objective gap between\npre-training and downstream tasks. Hence, we propose VPGNN, a prompt-based\nfine-tuning framework on GNNs for voucher abuse detection. We design a novel\ngraph prompting function to reformulate the downstream task into a similar\ntemplate as the pretext task in pre-training, thereby narrowing the objective\ngap. Extensive experiments on both proprietary and public datasets demonstrate\nthe strength of VPGNN in both few-shot and semi-supervised scenarios. Moreover,\nan online deployment of VPGNN in a production environment shows a 23.4%\nimprovement over two existing deployed models.\n","authors":["Zhihao Wen","Yuan Fang","Yihan Liu","Yang Guo","Shuji Hao"],"pdf_url":"https://arxiv.org/pdf/2308.10028v2.pdf","comment":"7 pages, Accepted by CIKM23 Applied Research Track"},{"id":"http://arxiv.org/abs/2308.15703v1","updated":"2023-08-30T01:56:57Z","published":"2023-08-30T01:56:57Z","title":"Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling\n Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate\n Prediction","summary":" Spatial-temporal information has been proven to be of great significance for\nclick-through rate prediction tasks in online Location-Based Services (LBS),\nespecially in mainstream food ordering platforms such as DoorDash, Uber Eats,\nMeituan, and Ele.me. Modeling user spatial-temporal preferences with sequential\nbehavior data has become a hot topic in recommendation systems and online\nadvertising. However, most existing methods either lack the representation\nof rich spatial-temporal information or only handle user behaviors with limited\nlength, e.g. 100. 
In this paper, we tackle these problems by designing a new\nspatial-temporal modeling paradigm named Fragment and Integrate Network (FIN).\nFIN consists of two networks: (i) Fragment Network (FN) extracts Multiple\nSub-Sequences (MSS) from lifelong sequential behavior data, and captures the\nspecific spatial-temporal representation by modeling each MSS respectively.\nHere both a simplified attention and a complicated attention are adopted to\nbalance the performance gain and resource consumption. (ii) Integrate Network\n(IN) builds a new integrated sequence by utilizing spatial-temporal interaction\non MSS and captures the comprehensive spatial-temporal representation by\nmodeling the integrated sequence with a complicated attention. Both public\ndatasets and production datasets have demonstrated the accuracy and scalability\nof FIN. Since 2022, FIN has been fully deployed in the recommendation\nadvertising system of Ele.me, one of the most popular online food ordering\nplatforms in China, obtaining 5.7% improvement on Click-Through Rate (CTR) and\n7.3% increase on Revenue Per Mille (RPM).\n","authors":["Jun Li","Jingjian Wang","Hongwei Wang","Xing Deng","Jielong Chen","Bing Cao","Zekun Wang","Guanjie Xu","Ge Zhang","Feng Shi","Hualei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15703v1.pdf","comment":"Accepted by CIKM 2023 Applied Research Paper"},{"id":"http://arxiv.org/abs/2308.15701v1","updated":"2023-08-30T01:54:48Z","published":"2023-08-30T01:54:48Z","title":"A Survey on Multi-Behavior Sequential Recommendation","summary":" Recommender systems are set up to address the issue of information overload in\ntraditional information retrieval systems by recommending, from massive amounts\nof information, the information that is of most interest to users. Generally,\nthere is a sequential nature and heterogeneity to the behavior of a\nperson interacting with a system, leading to the proposal of multi-behavior\nsequential recommendation (MBSR). MBSR is a relatively new and worthwhile\ndirection for in-depth research, which can achieve state-of-the-art\nrecommendation through suitable modeling, and some related works have already\nbeen proposed. This survey aims to shed light on the MBSR problem. Firstly, we\nintroduce MBSR in detail, including its problem definition, application\nscenarios and challenges faced. Secondly, we detail the classification of MBSR,\nincluding neighborhood-based methods, matrix factorization-based methods and\ndeep learning-based methods, where we further classify the deep learning-based\nmethods into different learning architectures based on RNN, GNN, Transformer,\nand generic architectures as well as architectures that integrate hybrid\ntechniques. In each method, we present related works based on the data\nperspective and the modeling perspective, as well as analyze the strengths,\nweaknesses and features of these works. Finally, we discuss some promising\nfuture research directions to address the challenges and improve the current\nstatus of MBSR.\n","authors":["Xiaoqing Chen","Zhitao Li","Weike Pan","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2308.15701v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.15470v2","updated":"2023-08-30T17:59:52Z","published":"2023-08-29T17:50:27Z","title":"Policy composition in reinforcement learning via multi-objective policy\n optimization","summary":" We enable reinforcement learning agents to learn successful behavior policies\nby utilizing relevant pre-existing teacher policies. 
The teacher policies are\nintroduced as objectives, in addition to the task objective, in a\nmulti-objective policy optimization setting. Using the Multi-Objective Maximum\na Posteriori Policy Optimization algorithm (Abdolmaleki et al. 2020), we show\nthat teacher policies can help speed up learning, particularly in the absence\nof shaping rewards. In two domains with continuous observation and action\nspaces, our agents successfully compose teacher policies in sequence and in\nparallel, and are also able to further extend the policies of the teachers in\norder to solve the task.\n Depending on the specified combination of task and teacher(s), teacher(s) may\nnaturally act to limit the final performance of an agent. The extent to which\nagents are required to adhere to teacher policies is determined by\nhyperparameters that control both the effect of teachers on learning speed\nand the eventual performance of the agent on the task. In the humanoid domain\n(Tassa et al. 2018), we also equip agents with the ability to control the\nselection of teachers. With this ability, agents are able to meaningfully\ncompose from the teacher policies to achieve a higher task reward on the walk\ntask than in cases without access to the teacher policies. We show the\nresemblance of composed task policies with the corresponding teacher policies\nthrough videos.\n","authors":["Shruti Mishra","Ankit Anand","Jordan Hoffmann","Nicolas Heess","Martin Riedmiller","Abbas Abdolmaleki","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2308.15470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16157v1","updated":"2023-08-30T17:22:11Z","published":"2023-08-30T17:22:11Z","title":"Algebraic, Topological, and Mereological Foundations of Existential\n Granules","summary":" In this research, new concepts of existential granules that determine\nthemselves are invented, and are characterized from algebraic, topological, and\nmereological perspectives. Existential granules are those that determine\nthemselves initially, and interact with their environment subsequently.\nExamples of the concept, such as those of granular balls, though inadequately\ndefined, algorithmically established, and insufficiently theorized in earlier\nworks by others, are already used in applications of rough sets and soft\ncomputing. It is shown that they fit into multiple theoretical frameworks\n(axiomatic, adaptive, and others) of granular computing. The characterization\nis intended for algorithm development, application to classification problems\nand possible mathematical foundations of generalizations of the approach.\nAdditionally, many open problems are posed and directions provided.\n","authors":["Mani A"],"pdf_url":"https://arxiv.org/pdf/2308.16157v1.pdf","comment":"15 Pages"},{"id":"http://arxiv.org/abs/2006.08426v4","updated":"2023-08-30T17:19:36Z","published":"2020-06-15T14:26:56Z","title":"Walking in the Shadow: A New Perspective on Descent Directions for\n Constrained Minimization","summary":" Descent directions, including movement towards Frank-Wolfe vertices,\naway-steps, in-face away-steps and pairwise directions, have been an important\ndesign consideration in conditional gradient descent (CGD) variants. In this\nwork, we attempt to demystify the impact of the movement in these directions\ntowards attaining constrained minimizers. The optimal local direction of\ndescent is the directional derivative (i.e., shadow) of the projection of the\nnegative gradient. 
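The shadow just defined admits a direct numerical illustration: it is the directional derivative of the projection operator at x along the negative gradient, d(x) = lim_{eps -> 0} (P(x - eps * grad) - x) / eps. A minimal finite-difference sketch follows, with a box constraint standing in for a general polytope (an assumption for simplicity).

```python
# Finite-difference sketch of the "shadow" from the abstract above:
# the directional derivative of the projection of the negative
# gradient. A box constraint stands in for a general polytope.
import numpy as np

def project_box(y, lo=0.0, hi=1.0):
    return np.clip(y, lo, hi)

def shadow_direction(x, grad, eps=1e-7):
    return (project_box(x - eps * grad) - x) / eps

x = np.array([0.0, 0.5, 1.0])        # coords 0 and 2 sit on the boundary
grad = np.array([1.0, 1.0, -1.0])    # -grad pushes coords 0 and 2 outward
print(shadow_direction(x, grad))     # [ 0. -1.  0.]: infeasible moves zeroed
```

On the interior the shadow coincides with the negative gradient; on the boundary, the components that would leave the feasible set are zeroed out, which is what makes it the best local feasible direction.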
We show\nthat this direction is the best away-step possible, and the continuous-time\ndynamics of moving in the shadow is equivalent to the dynamics of projected\ngradient descent (PGD), although it's non-trivial to discretize. We also show\nthat Frank-Wolfe (FW) vertices correspond to projecting onto the polytope using\nan \"infinite\" step in the direction of the negative gradient, thus providing a\nnew perspective on these steps. We combine these insights into a novel\nShadow-CG method that uses FW and shadow steps, while enjoying linear\nconvergence, with a rate that depends on the number of breakpoints in its\nprojection curve, rather than the pyramidal width. We provide a linear bound on\nthe number of breakpoints for simple polytopes and present scaling-invariant\nupper bounds for general polytopes based on the number of facets. We exemplify\nthe benefit of using Shadow-CG computationally for various applications, while\nraising an open question about tightening the bound on the number of\nbreakpoints for general polytopes.\n","authors":["Hassan Mortagy","Swati Gupta","Sebastian Pokutta"],"pdf_url":"https://arxiv.org/pdf/2006.08426v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16150v1","updated":"2023-08-30T17:16:02Z","published":"2023-08-30T17:16:02Z","title":"Modality Cycles with Masked Conditional Diffusion for Unsupervised\n Anomaly Segmentation in MRI","summary":" Unsupervised anomaly segmentation aims to detect patterns that are distinct\nfrom any patterns processed during training, commonly called abnormal or\nout-of-distribution patterns, without providing any associated manual\nsegmentations. Since anomalies during deployment can lead to model failure,\ndetecting the anomaly can enhance the reliability of models, which is valuable\nin high-risk domains like medical imaging. This paper introduces Masked\nModality Cycles with Conditional Diffusion (MMCCD), a method that enables\nsegmentation of anomalies across diverse patterns in multimodal MRI. The method\nis based on two fundamental ideas. First, we propose the use of cyclic modality\ntranslation as a mechanism for enabling abnormality detection.\nImage-translation models learn tissue-specific modality mappings, which are\ncharacteristic of tissue physiology. Thus, these learned mappings fail to\ntranslate tissues or image patterns that have never been encountered during\ntraining, and the error enables their segmentation. Furthermore, we combine\nimage translation with a masked conditional diffusion model, which attempts to\n`imagine' what tissue exists under a masked area, further exposing unknown\npatterns as the generative model fails to recreate them. We evaluate our method\non a proxy task by training on healthy-looking slices of BraTS2021\nmulti-modality MRIs and testing on slices with tumors. 
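The cyclic-translation idea in the MMCCD abstract above can be made concrete with a toy stand-in: a modality mapping fit only on healthy tissue reconstructs healthy regions well, so the translation residual localizes patterns never seen in training. The affine toy translator and threshold below are purely illustrative; the paper uses a learned translation model plus masked conditional diffusion.

```python
# Schematic sketch of translation-error anomaly scoring, per the MMCCD
# abstract above: a mapping "trained" on healthy tissue fails on unseen
# patterns, so the residual flags them. Everything here is a toy.
import numpy as np

def toy_translator(modality_a):
    # Pretend this affine intensity transform is the learned mapping,
    # fit on healthy tissue only.
    return 0.8 * modality_a + 0.1

rng = np.random.default_rng(2)
healthy = rng.uniform(0.2, 0.4, size=(64, 64))
modality_b = 0.8 * healthy + 0.1          # consistent with the mapping
modality_b[20:30, 20:30] = 0.95           # "tumor": violates the mapping

residual = np.abs(toy_translator(healthy) - modality_b)
anomaly_mask = residual > 0.1
print(anomaly_mask.sum(), "anomalous pixels flagged")  # 100 (the 10x10 patch)
```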
We show that our method\ncompares favorably to previous unsupervised approaches based on image\nreconstruction and denoising with autoencoders and diffusion models.\n","authors":["Ziyun Liang","Harry Anthony","Felix Wagner","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2308.16150v1.pdf","comment":"Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI\n 2023"},{"id":"http://arxiv.org/abs/2308.16149v1","updated":"2023-08-30T17:07:17Z","published":"2023-08-30T17:07:17Z","title":"Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open\n Generative Large Language Models","summary":" We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric\nfoundation and instruction-tuned open generative large language models (LLMs).\nThe models are based on the GPT-3 decoder-only architecture and are pretrained\non a mixture of Arabic and English texts, including source code in various\nprogramming languages. With 13 billion parameters, they demonstrate better\nknowledge and reasoning capabilities in Arabic than any existing open Arabic\nand multilingual models by a sizable margin, based on extensive evaluation.\nMoreover, the models are competitive in English compared to English-centric\nopen models of similar size, despite being trained on much less English data.\nWe provide a detailed description of the training, the tuning, the safety\nalignment, and the evaluation of the models. We release two open versions of\nthe model -- the foundation Jais model, and an instruction-tuned Jais-chat\nvariant -- with the aim of promoting research on Arabic LLMs. Available at\nhttps://huggingface.co/inception-mbzuai/jais-13b-chat\n","authors":["Neha Sengupta","Sunil Kumar Sahu","Bokang Jia","Satheesh Katipomu","Haonan Li","Fajri Koto","Osama Mohammed Afzal","Samta Kamboj","Onkar Pandit","Rahul Pal","Lalit Pradhan","Zain Muhammad Mujahid","Massa Baali","Alham Fikri Aji","Zhengzhong Liu","Andy Hock","Andrew Feldman","Jonathan Lee","Andrew Jackson","Preslav Nakov","Timothy Baldwin","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2308.16149v1.pdf","comment":"Arabic-centric, foundation model, large-language model, LLM,\n generative model, instruction-tuned, Jais, Jais-chat"},{"id":"http://arxiv.org/abs/2308.01981v2","updated":"2023-08-30T17:02:55Z","published":"2023-08-03T18:28:50Z","title":"CartiMorph: a framework for automated knee articular cartilage\n morphometrics","summary":" We introduce CartiMorph, a framework for automated knee articular cartilage\nmorphometrics. It takes an image as input and generates quantitative metrics\nfor cartilage subregions, including the percentage of full-thickness cartilage\nloss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the\npower of deep learning models for hierarchical image feature representation.\nDeep learning models were trained and validated for tissue segmentation,\ntemplate construction, and template-to-image registration. We established\nmethods for surface-normal-based cartilage thickness mapping, FCL estimation,\nand rule-based cartilage parcellation. Our cartilage thickness map showed less\nerror in thin and peripheral regions. We evaluated the effectiveness of the\nadopted segmentation model by comparing the quantitative metrics obtained from\nmodel segmentation and those from manual segmentation. 
The root-mean-squared\ndeviation of the FCL measurements was less than 8%, and strong correlations\nwere observed for the mean thickness (Pearson's correlation coefficient $\\rho\n\\in [0.82,0.97]$), surface area ($\\rho \\in [0.82,0.98]$) and volume ($\\rho \\in\n[0.89,0.98]$) measurements. We compared our FCL measurements with those from a\nprevious study and found that our measurements deviated less from the ground\ntruths. We observed superior performance of the proposed rule-based cartilage\nparcellation method compared with the atlas-based approach. CartiMorph has the\npotential to promote imaging biomarker discovery for knee osteoarthritis.\n","authors":["Yongcheng Yao","Junru Zhong","Liping Zhang","Sheheryar Khan","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01981v2.pdf","comment":"To be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.16139v1","updated":"2023-08-30T16:52:20Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. 
Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Eduardo A. Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.14815v2","updated":"2023-08-30T16:31:53Z","published":"2023-08-28T18:06:24Z","title":"Distributionally Robust Statistical Verification with Imprecise Neural\n Networks","summary":" A particularly challenging problem in AI safety is providing guarantees on\nthe behavior of high-dimensional autonomous systems. Verification approaches\ncentered around reachability analysis fail to scale, and purely statistical\napproaches are constrained by the distributional assumptions about the sampling\nprocess. Instead, we pose a distributionally robust version of the statistical\nverification problem for black-box systems, where our performance guarantees\nhold over a large family of distributions. This paper proposes a novel approach\nbased on a combination of active learning, uncertainty quantification, and\nneural network verification. A central piece of our approach is an ensemble\ntechnique called Imprecise Neural Networks, which provides the uncertainty to\nguide active learning. The active learning uses an exhaustive neural-network\nverification tool Sherlock to collect samples. 
An evaluation on multiple\nphysical simulators in the OpenAI Gym MuJoCo environments with\nreinforcement-learned controllers demonstrates that our approach can provide\nuseful and scalable guarantees for high-dimensional systems.\n","authors":["Souradeep Dutta","Michele Caprio","Vivian Lin","Matthew Cleaveland","Kuk Jin Jang","Ivan Ruchkin","Oleg Sokolsky","Insup Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14683v2","updated":"2023-08-30T16:30:29Z","published":"2023-05-24T03:44:50Z","title":"On progressive sharpening, flat minima and generalisation","summary":" We present a new approach to understanding the relationship between loss\ncurvature and input-output model behaviour in deep learning. Specifically, we\nuse existing empirical analyses of the spectrum of deep network loss Hessians\nto ground an ansatz tying together the loss Hessian and the input-output\nJacobian of a deep neural network over training samples throughout training. We\nthen prove a series of theoretical results which quantify the degree to which\nthe input-output Jacobian of a model approximates its Lipschitz norm over a\ndata distribution, and deduce a novel generalisation bound in terms of the\nempirical Jacobian. We use our ansatz, together with our theoretical results,\nto give a new account of the recently observed progressive sharpening\nphenomenon, as well as the generalisation properties of flat minima.\nExperimental evidence is provided to validate our claims.\n","authors":["Lachlan Ewen MacDonald","Jack Valmadre","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2305.14683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16122v1","updated":"2023-08-30T16:21:02Z","published":"2023-08-30T16:21:02Z","title":"Spatial Graph Coarsening: Weather and Weekday Prediction with London's\n Bike-Sharing Service using GNN","summary":" This study introduces the use of a Graph Neural Network (GNN) for predicting\nthe weather and weekday of a day in London from the dataset of the Santander\nCycles bike-sharing system, as a graph classification task. The proposed GNN\nmodels newly introduce (i) a concatenation operator of graph features with\ntrained node embeddings and (ii) a graph coarsening operator based on\ngeographical contiguity, namely \"Spatial Graph Coarsening\". With the node\nfeatures of land-use characteristics and number of households around the bike\nstations and graph features of temperatures in the city, our proposed models\noutperformed the baseline model in cross-entropy loss and accuracy on the\nvalidation dataset.\n","authors":["Yuta Sato","Pak Hei Lam","Shruti Gupta","Fareesah Hussain"],"pdf_url":"https://arxiv.org/pdf/2308.16122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16113v1","updated":"2023-08-30T16:14:20Z","published":"2023-08-30T16:14:20Z","title":"survex: an R package for explaining machine learning survival models","summary":" Due to their flexibility and superior performance, machine learning models\nfrequently complement and outperform traditional statistical survival models.\nHowever, their widespread adoption is hindered by a lack of user-friendly tools\nto explain their internal operations and prediction rationales. To tackle this\nissue, we introduce the survex R package, which provides a cohesive framework\nfor explaining any survival model by applying explainable artificial\nintelligence techniques. 
The capabilities of the proposed software encompass\nunderstanding and diagnosing survival models, which can lead to their\nimprovement. By revealing insights into the decision-making process, such as\nvariable effects and importances, survex enables the assessment of model\nreliability and the detection of biases. Thus, transparency and responsibility\nmay be promoted in sensitive areas, such as biomedical research and healthcare\napplications.\n","authors":["Mikołaj Spytek","Mateusz Krzyziński","Sophie Hanna Langbein","Hubert Baniecki","Marvin N. Wright","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.16113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11582v2","updated":"2023-08-30T16:06:27Z","published":"2023-05-19T10:43:57Z","title":"What You Hear Is What You See: Audio Quality Metrics From Image Quality\n Metrics","summary":" In this study, we investigate the feasibility of utilizing state-of-the-art\nimage perceptual metrics for evaluating audio signals by representing them as\nspectrograms. The encouraging outcome of the proposed approach is based on the\nsimilarity between the neural mechanisms in the auditory and visual pathways.\nFurthermore, we customise one of the metrics which has a psychoacoustically\nplausible architecture to account for the peculiarities of sound signals. We\nevaluate the effectiveness of our proposed metric and several baseline metrics\nusing a music dataset, with promising results in terms of the correlation\nbetween the metrics and the perceived quality of audio as rated by human\nevaluators.\n","authors":["Tashi Namgyal","Alexander Hepburn","Raul Santos-Rodriguez","Valero Laparra","Jesus Malo"],"pdf_url":"https://arxiv.org/pdf/2305.11582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07001v2","updated":"2023-08-30T15:58:45Z","published":"2023-06-12T10:10:57Z","title":"Cancellation-Free Regret Bounds for Lagrangian Approaches in Constrained\n Markov Decision Processes","summary":" Constrained Markov Decision Processes (CMDPs) are one of the common ways to\nmodel safe reinforcement learning problems, where constraint functions model\nthe safety objectives. Lagrangian-based dual or primal-dual algorithms provide\nefficient methods for learning in CMDPs. For these algorithms, the currently\nknown regret bounds in the finite-horizon setting allow for a \"cancellation of\nerrors\"; one can compensate for a constraint violation in one episode with a\nstrict constraint satisfaction in another. However, we do not consider such a\nbehavior safe in practical applications. In this paper, we overcome this\nweakness by proposing a novel model-based dual algorithm OptAug-CMDP for\ntabular finite-horizon CMDPs. Our algorithm is motivated by the augmented\nLagrangian method and can be performed efficiently. 
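For contrast with the OptAug-CMDP method in the abstract above, here is the generic Lagrangian primal-dual scheme it improves upon, on a toy one-step CMDP: the dual variable prices the cost constraint, and over iterations violations in some rounds can be offset by slack in others, the "cancellation of errors" the paper avoids. All quantities below are illustrative, and this is not the paper's algorithm.

```python
# Generic Lagrangian primal-dual updates on a toy one-step CMDP, the
# baseline scheme the abstract above improves upon (NOT OptAug-CMDP).
# Action 0 earns more reward but its cost exceeds the budget, so the
# dual variable lambda gradually prices it.
import numpy as np

r = np.array([1.0, 0.6])    # per-action reward
c = np.array([1.0, 0.0])    # per-action cost
budget = 0.2                # constraint: expected cost <= 0.2
theta = np.zeros(2)         # policy logits
lam = 0.0

for _ in range(2000):
    pi = np.exp(theta) / np.exp(theta).sum()
    adv = r - lam * c                              # Lagrangian payoff
    theta += 0.2 * pi * (adv - pi @ adv)           # softmax policy ascent
    lam = max(0.0, lam + 0.2 * (pi @ c - budget))  # dual ascent on violation

print(f"pi = {pi.round(3)}, expected cost = {pi @ c:.3f}, lambda = {lam:.2f}")
```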
We show that during $K$\nepisodes of exploring the CMDP, our algorithm obtains a regret of\n$\\tilde{O}(\\sqrt{K})$ for both the objective and the constraint violation.\nUnlike existing Lagrangian approaches, our algorithm achieves this regret\nwithout the need for the cancellation of errors.\n","authors":["Adrian Müller","Pragnya Alatur","Giorgia Ramponi","Niao He"],"pdf_url":"https://arxiv.org/pdf/2306.07001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16105v1","updated":"2023-08-30T15:54:06Z","published":"2023-08-30T15:54:06Z","title":"Advanced Deep Regression Models for Forecasting Time Series Oil\n Production","summary":" Global oil demand is rapidly increasing and is expected to reach 106.3\nmillion barrels per day by 2040. Thus, it is vital for hydrocarbon extraction\nindustries to forecast their production to optimize their operations and avoid\nlosses. Big companies have realized that exploiting the power of deep learning\n(DL) and the massive amount of data from various oil wells for this purpose can\nsave a lot of operational costs and reduce unwanted environmental impacts. In\nthis direction, researchers have proposed models using conventional machine\nlearning (ML) techniques for oil production forecasting. However, these\ntechniques are inappropriate for this problem as they cannot capture\nhistorical patterns found in time series data, resulting in inaccurate\npredictions. This research aims to overcome these issues by developing advanced\ndata-driven regression models using sequential convolutions and long short-term\nmemory (LSTM) units. Exhaustive analyses are conducted to select the optimal\nsequence length, model hyperparameters, and cross-well dataset formation to\nbuild highly generalized robust models. A comprehensive experimental study on\nVolve oilfield data validates the proposed models. It reveals that the\nLSTM-based sequence learning model can predict oil production better than the\n1-D convolutional neural network (CNN) with mean absolute error (MAE) and R2\nscore of 111.16 and 0.98, respectively. It is also found that the LSTM-based\nmodel performs better than all the existing state-of-the-art solutions and\nachieves a 37% improvement compared to a standard linear regression, which is\nconsidered the baseline model in this work.\n","authors":["Siavash Hosseini","Thangarajah Akilan"],"pdf_url":"https://arxiv.org/pdf/2308.16105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12767v2","updated":"2023-08-30T15:52:34Z","published":"2023-08-24T13:14:49Z","title":"On the Consistency of Average Embeddings for Item Recommendation","summary":" A prevalent practice in recommender systems consists in averaging item\nembeddings to represent users or higher-level concepts in the same embedding\nspace. This paper investigates the relevance of such a practice. For this\npurpose, we propose an expected precision score, designed to measure the\nconsistency of an average embedding relative to the items used for its\nconstruction. We subsequently analyze the mathematical expression of this score\nin a theoretical setting with specific assumptions, as well as its empirical\nbehavior on real-world data from music streaming services. 
Our results\nemphasize that real-world averages are less consistent for recommendation,\nwhich paves the way for future research to better align real-world embeddings\nwith assumptions from our theoretical setting.\n","authors":["Walid Bendada","Guillaume Salha-Galvan","Romain Hennequin","Thomas Bouabça","Tristan Cazenave"],"pdf_url":"https://arxiv.org/pdf/2308.12767v2.pdf","comment":"17th ACM Conference on Recommender Systems (RecSys 2023)"},{"id":"http://arxiv.org/abs/2104.10751v3","updated":"2023-08-30T15:51:05Z","published":"2021-04-21T20:31:28Z","title":"Rule Generation for Classification: Scalability, Interpretability, and\n Fairness","summary":" We introduce a new rule-based optimization method for classification with\nconstraints. The proposed method leverages column generation for linear\nprogramming, and hence, is scalable to large datasets. The resulting pricing\nsubproblem is shown to be NP-Hard. We recourse to a decision tree-based\nheuristic and solve a proxy pricing subproblem for acceleration. The method\nreturns a set of rules along with their optimal weights indicating the\nimportance of each rule for learning. We address interpretability and fairness\nby assigning cost coefficients to the rules and introducing additional\nconstraints. In particular, we focus on local interpretability and generalize\nseparation criterion in fairness to multiple sensitive attributes and classes.\nWe test the performance of the proposed methodology on a collection of datasets\nand present a case study to elaborate on its different aspects. The proposed\nrule-based learning method exhibits a good compromise between local\ninterpretability and fairness on the one side, and accuracy on the other side.\n","authors":["Adia C. Lumadjeng","Tabea Röber","M. Hakan Akyüz","Ş. İlker Birbil"],"pdf_url":"https://arxiv.org/pdf/2104.10751v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03975v3","updated":"2023-08-30T15:32:59Z","published":"2021-10-08T08:49:35Z","title":"Tensor train completion: local recovery guarantees via Riemannian\n optimization","summary":" In this work, we estimate the number of randomly selected elements of a\ntensor that with high probability guarantees local convergence of Riemannian\ngradient descent for tensor train completion. We derive a new bound for the\northogonal projections onto the tangent spaces based on the harmonic mean of\nthe unfoldings' singular values and introduce a notion of core coherence for\ntensor trains. We also extend the results to tensor train completion with\nauxiliary subspace information and obtain the corresponding local convergence\nguarantees.\n","authors":["Stanislav Budzinskiy","Nikolai Zamarashkin"],"pdf_url":"https://arxiv.org/pdf/2110.03975v3.pdf","comment":"1 figure added; Accepted version"},{"id":"http://arxiv.org/abs/2308.16089v1","updated":"2023-08-30T15:26:35Z","published":"2023-08-30T15:26:35Z","title":"Application of Zone Method based Machine Learning and Physics-Informed\n Neural Networks in Reheating Furnaces","summary":" Despite the high economic relevance of Foundation Industries, certain\ncomponents like Reheating furnaces within their manufacturing chain are\nenergy-intensive. Notable energy consumption reduction could be obtained by\nreducing the overall heating time in furnaces. 
Computer-integrated Machine\nLearning (ML) and Artificial Intelligence (AI) powered control systems in\nfurnaces could be enablers in achieving the Net-Zero goals in Foundation\nIndustries for sustainable manufacturing.\n In this work, due to the infeasibility of achieving good quality data in\nscenarios like reheating furnaces, a classical Hottel's zone method based\ncomputational model has been used to generate data for ML and Deep Learning\n(DL) based model training via regression. It should be noted that the zone\nmethod provides an elegant way to model the physical phenomenon of Radiative\nHeat Transfer (RHT), the dominating heat transfer mechanism in high-temperature\nprocesses inside heating furnaces. Using this data, an extensive comparison\namong a wide range of state-of-the-art, representative ML and DL methods has\nbeen made against their temperature prediction performances in varying furnace\nenvironments. Owing to their holistic balance among inference times and model\nperformance, DL stands out among its counterparts. To further enhance the\nOut-Of-Distribution (OOD) generalization capability of the trained DL models,\nwe propose a Physics-Informed Neural Network (PINN) by incorporating prior\nphysical knowledge using a set of novel Energy-Balance regularizers. Our setup\nis a generic framework, is geometry-agnostic of the 3D structure of the\nunderlying furnace, and as such could accommodate any standard ML regression\nmodel, to serve as a Digital Twin of the underlying physical processes, for\ntransitioning Foundation Industries towards Industry 4.0.\n","authors":["Ujjal Kr Dutta","Aldo Lipani","Chuan Wang","Yukun Hu"],"pdf_url":"https://arxiv.org/pdf/2308.16089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11197v2","updated":"2023-08-30T15:26:00Z","published":"2023-02-22T08:14:24Z","title":"Quantized Low-Rank Multivariate Regression with Random Dithering","summary":" Low-rank multivariate regression (LRMR) is an important statistical learning\nmodel that combines highly correlated tasks as a multiresponse regression\nproblem with low-rank priori on the coefficient matrix. In this paper, we study\nquantized LRMR, a practical setting where the responses and/or the covariates\nare discretized to finite precision. We focus on the estimation of the\nunderlying coefficient matrix. To make possible a consistent estimator that can\nachieve arbitrarily small error, we employ uniform quantization with random\ndithering, i.e., we add appropriate random noise to the data before\nquantization. Specifically, uniform dither and triangular dither are used for\nresponses and covariates, respectively. Based on the quantized data, we propose\nthe constrained Lasso and regularized Lasso estimators, and derive the\nnon-asymptotic error bounds. With the aid of dithering, the estimators achieve\nminimax optimal rate, while quantization only slightly worsens the\nmultiplicative factor in the error rate. Moreover, we extend our results to a\nlow-rank regression model with matrix responses. We corroborate and demonstrate\nour theoretical results via simulations on synthetic data or image restoration.\n","authors":["Junren Chen","Yueqi Wang","Michael K. 
Ng"],"pdf_url":"https://arxiv.org/pdf/2302.11197v2.pdf","comment":"16 pages (Submitted)"},{"id":"http://arxiv.org/abs/2305.09438v3","updated":"2023-08-30T14:56:16Z","published":"2023-05-16T13:50:24Z","title":"MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with\n Transformers","summary":" Message Passing Interface (MPI) plays a crucial role in distributed memory\nparallelization across multiple nodes. However, parallelizing MPI code\nmanually, and specifically, performing domain decomposition, is a challenging,\nerror-prone task. In this paper, we address this problem by developing\nMPI-RICAL, a novel data-driven, programming-assistance tool that assists\nprogrammers in writing domain decomposition based distributed memory\nparallelization code. Specifically, we train a supervised language model to\nsuggest MPI functions and their proper locations in the code on the fly. We\nalso introduce MPICodeCorpus, the first publicly available corpus of MPI-based\nparallel programs that is created by mining more than 15,000 open-source\nrepositories on GitHub. Experimental results have been done on MPICodeCorpus\nand more importantly, on a compiled benchmark of MPI-based parallel programs\nfor numerical computations that represent real-world scientific applications.\nMPI-RICAL achieves F1 scores between 0.87-0.91 on these programs, demonstrating\nits accuracy in suggesting correct MPI functions at appropriate code\nlocations.. The source code used in this work, as well as other relevant\nsources, are available at:\nhttps://github.com/Scientific-Computing-Lab-NRCN/MPI-rical\n","authors":["Nadav Schneider","Tal Kadosh","Niranjan Hasabnis","Timothy Mattson","Yuval Pinter","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2305.09438v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16067v1","updated":"2023-08-30T14:44:04Z","published":"2023-08-30T14:44:04Z","title":"Consensus of state of the art mortality prediction models: From\n all-cause mortality to sudden death prediction","summary":" Worldwide, many millions of people die suddenly and unexpectedly each year,\neither with or without a prior history of cardiovascular disease. Such events\nare sparse (once in a lifetime), many victims will not have had prior\ninvestigations for cardiac disease and many different definitions of sudden\ndeath exist. Accordingly, sudden death is hard to predict.\n This analysis used NHS Electronic Health Records (EHRs) for people aged\n$\\geq$50 years living in the Greater Glasgow and Clyde (GG\\&C) region in 2010\n(n = 380,000) to try to overcome these challenges. We investigated whether\nmedical history, blood tests, prescription of medicines, and hospitalisations\nmight, in combination, predict a heightened risk of sudden death.\n We compared the performance of models trained to predict either sudden death\nor all-cause mortality. We built six models for each outcome of interest: three\ntaken from state-of-the-art research (BEHRT, Deepr and Deep Patient), and three\nof our own creation. We trained these using two different data representations:\na language-based representation, and a sparse temporal matrix.\n We used global interpretability to understand the most important features of\neach model, and compare how much agreement there was amongst models using Rank\nBiased Overlap. It is challenging to account for correlated variables without\nincreasing the complexity of the interpretability technique. 
We overcame this\nby clustering features into groups and comparing the most important groups for\neach model. We found the agreement between models to be much higher when\naccounting for correlated variables.\n Our analysis emphasises the challenge of predicting sudden death and\nemphasises the need for better understanding and interpretation of machine\nlearning models applied to healthcare applications.\n","authors":["Dr Yola Jones","Dr Fani Deligianni","Dr Jeff Dalton","Dr Pierpaolo Pellicori","Professor John G F Cleland"],"pdf_url":"https://arxiv.org/pdf/2308.16067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08491v3","updated":"2023-08-30T14:43:46Z","published":"2023-01-20T09:36:42Z","title":"Modeling Moral Choices in Social Dilemmas with Multi-Agent Reinforcement\n Learning","summary":" Practical uses of Artificial Intelligence (AI) in the real world have\ndemonstrated the importance of embedding moral choices into intelligent agents.\nThey have also highlighted that defining top-down ethical constraints on AI\naccording to any one type of morality is extremely challenging and can pose\nrisks. A bottom-up learning approach may be more appropriate for studying and\ndeveloping ethical behavior in AI agents. In particular, we believe that an\ninteresting and insightful starting point is the analysis of emergent behavior\nof Reinforcement Learning (RL) agents that act according to a predefined set of\nmoral rewards in social dilemmas.\n In this work, we present a systematic analysis of the choices made by\nintrinsically-motivated RL agents whose rewards are based on moral theories. We\naim to design reward structures that are simplified yet representative of a set\nof key ethical systems. Therefore, we first define moral reward functions that\ndistinguish between consequence- and norm-based agents, between morality based\non societal norms or internal virtues, and between single- and mixed-virtue\n(e.g., multi-objective) methodologies. Then, we evaluate our approach by\nmodeling repeated dyadic interactions between learning moral agents in three\niterated social dilemma games (Prisoner's Dilemma, Volunteer's Dilemma and Stag\nHunt). We analyze the impact of different types of morality on the emergence of\ncooperation, defection or exploitation, and the corresponding social outcomes.\nFinally, we discuss the implications of these findings for the development of\nmoral agents in artificial and mixed human-AI societies.\n","authors":["Elizaveta Tennant","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2301.08491v3.pdf","comment":"Accepted at IJCAI 2023 (32nd International Joint Conference on\n Artificial Intelligence - Macao, S.A.R.)"},{"id":"http://arxiv.org/abs/2107.07752v2","updated":"2023-08-30T14:39:24Z","published":"2021-07-16T08:07:22Z","title":"NeXtQSM -- A complete deep learning pipeline for data-consistent\n quantitative susceptibility mapping trained with hybrid data","summary":" Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great\npotential in recent years, obtaining similar results to established\nnon-learning approaches. Many current deep learning approaches are not data\nconsistent, require in vivo training data or solve the QSM problem in\nconsecutive steps resulting in the propagation of errors. Here we aim to\novercome these limitations and developed a framework to solve the QSM\nprocessing steps jointly. 
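Rank Biased Overlap, which the mortality-prediction abstract above uses to compare models' feature-importance rankings, has a compact standard form (Webber et al., 2010): a truncated, top-weighted sum of set agreements at increasing depths. The persistence value p = 0.9 and the clinical feature names below are illustrative choices, not necessarily the study's settings.

```python
# Minimal truncated Rank Biased Overlap (Webber et al., 2010), the
# ranking-agreement measure used in the mortality-prediction abstract
# above. Larger p spreads weight deeper down the rankings.
def rbo(ranking_a, ranking_b, p=0.9):
    depth = min(len(ranking_a), len(ranking_b))
    score, seen_a, seen_b = 0.0, set(), set()
    for d in range(1, depth + 1):
        seen_a.add(ranking_a[d - 1])
        seen_b.add(ranking_b[d - 1])
        agreement = len(seen_a & seen_b) / d   # overlap fraction at depth d
        score += (p ** (d - 1)) * agreement
    return (1 - p) * score

# Hypothetical feature-importance rankings from two models:
model_1 = ["age", "bnp", "creatinine", "sodium", "haemoglobin"]
model_2 = ["bnp", "age", "sodium", "creatinine", "potassium"]
print(f"RBO = {rbo(model_1, model_2):.3f}")
```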
We developed a new hybrid training data generation\nmethod that enables end-to-end training for solving background field\ncorrection and dipole inversion in a data-consistent fashion using a\nvariational network that combines the QSM model term and a learned regularizer.\nWe demonstrate that NeXtQSM overcomes the limitations of previous deep learning\nmethods. NeXtQSM offers a new deep learning based pipeline for computing\nquantitative susceptibility maps that integrates each processing step into the\ntraining and provides results that are robust and fast.\n","authors":["Francesco Cognolato","Kieran O'Brien","Jin Jin","Simon Robinson","Frederik B. Laun","Markus Barth","Steffen Bollmann"],"pdf_url":"https://arxiv.org/pdf/2107.07752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16061v1","updated":"2023-08-30T14:36:25Z","published":"2023-08-30T14:36:25Z","title":"Conti Inc.: Understanding the Internal Discussions of a large\n Ransomware-as-a-Service Operator with Machine Learning","summary":" Ransomware-as-a-service (RaaS) is increasing the scale and complexity of\nransomware attacks. Understanding the internal operations behind RaaS has been\na challenge due to the illegality of such activities. The recent chat leak of\nthe Conti RaaS operator, one of the most infamous ransomware operators on the\ninternational scene, offers a key opportunity to better understand the inner\nworkings of such organizations. This paper analyzes the main topic discussions\nin the Conti chat leak using machine learning techniques such as Natural\nLanguage Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as\nvisualization strategies. Five discussion topics are found: 1) Business, 2)\nTechnical, 3) Internal tasking/Management, 4) Malware, and 5) Customer\nService/Problem Solving. Moreover, the distribution of topics among Conti\nmembers shows that only 4% of individuals have specialized discussions while\nalmost all individuals (96%) are all-rounders, meaning that their discussions\nrevolve around the five topics. The results also indicate that a significant\nproportion of Conti discussions are non-tech related. This study thus\nhighlights that running such large RaaS operations requires a workforce skilled\nbeyond technical abilities, with individuals involved in various tasks, from\nmanagement to customer service or problem solving. The discussion topics also\nshow that the organization behind the Conti RaaS operator shares\nsimilarities with a large firm. We conclude that, although RaaS represents an\nexample of specialization in the cybercrime industry, only a few members are\nspecialized in one topic, while the rest run and coordinate the RaaS\noperation.\n","authors":["Estelle Ruellan","Masarah Paquet-Clouston","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.16061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16059v1","updated":"2023-08-30T14:31:24Z","published":"2023-08-30T14:31:24Z","title":"A Parameter-Free Two-Bit Covariance Estimator with Improved Operator\n Norm Error Rate","summary":" A covariance matrix estimator using two bits per entry was recently developed\nby Dirksen, Maly and Rauhut [Annals of Statistics, 50(6), pp. 3538-3562]. 
The\nestimator achieves near minimax rate for general sub-Gaussian distributions,\nbut also suffers from two downsides: theoretically, there is an essential gap\nin operator norm error between their estimator and the sample covariance when\nthe diagonal of the covariance matrix is dominated by only a few entries;\npractically, its performance heavily relies on the dithering scale, which needs\nto be tuned according to some unknown parameters. In this work, we propose a\nnew 2-bit covariance matrix estimator that simultaneously addresses both\nissues. Unlike the sign quantizer associated with uniform dither in Dirksen et\nal., we adopt a triangular dither prior to a 2-bit quantizer inspired by the\nmulti-bit uniform quantizer. By employing dithering scales varying across\nentries, our estimator enjoys an improved operator norm error rate that depends\non the effective rank of the underlying covariance matrix rather than the\nambient dimension, thus closing the theoretical gap. Moreover, our proposed\nmethod eliminates the need for any tuning parameter, as the dithering scales are\nentirely determined by the data. Experimental results under Gaussian samples\nare provided to showcase the impressive numerical performance of our estimator.\nRemarkably, by halving the dithering scales, our estimator oftentimes achieves\noperator norm errors less than twice the errors of the sample covariance.\n","authors":["Junren Chen","Michael K. Ng"],"pdf_url":"https://arxiv.org/pdf/2308.16059v1.pdf","comment":"24 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.16056v1","updated":"2023-08-30T14:28:26Z","published":"2023-08-30T14:28:26Z","title":"Low-Rank Multitask Learning based on Tensorized SVMs and LSSVMs","summary":" Multitask learning (MTL) leverages task-relatedness to enhance performance.\nWith the emergence of multimodal data, tasks can now be referenced by multiple\nindices. In this paper, we employ high-order tensors, with each mode\ncorresponding to a task index, to naturally represent tasks referenced by\nmultiple indices and preserve their structural relations. Based on this\nrepresentation, we propose a general framework of low-rank MTL methods with\ntensorized support vector machines (SVMs) and least squares support vector\nmachines (LSSVMs), where the CP factorization is deployed over the coefficient\ntensor. Our approach allows us to model the task relation through a linear\ncombination of shared factors weighted by task-specific factors and is\ngeneralized to both classification and regression problems. Through the\nalternating optimization scheme and the Lagrangian function, each subproblem is\ntransformed into a convex problem, formulated as a quadratic program or a\nlinear system in the dual form. In contrast to previous MTL frameworks, our\ndecision function in the dual induces a weighted kernel function with a\ntask-coupling term characterized by the similarities of the task-specific\nfactors, better revealing the explicit relations across tasks in MTL.\nExperimental results validate the effectiveness and superiority of our proposed\nmethods compared to existing state-of-the-art approaches in MTL. The\nimplementation code will be available at https://github.com/liujiani0216/TSVM-MTL.\n","authors":["Jiani Liu","Qinghua Tao","Ce Zhu","Yipeng Liu","Xiaolin Huang","Johan A. K. 
Suykens"],"pdf_url":"https://arxiv.org/pdf/2308.16056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00736v3","updated":"2023-08-30T14:11:10Z","published":"2022-12-01T18:29:48Z","title":"An exponentially-growing family of universal quantum circuits","summary":" Quantum machine learning has become an area of growing interest but has\ncertain theoretical and hardware-specific limitations. Notably, the problem of\nvanishing gradients, or barren plateaus, renders the training impossible for\ncircuits with high qubit counts, imposing a limit on the number of qubits that\ndata scientists can use for solving problems. Independently, angle-embedded\nsupervised quantum neural networks were shown to produce truncated Fourier\nseries with a degree directly dependent on two factors: the depth of the\nencoding and the number of parallel qubits the encoding applied to. The degree\nof the Fourier series limits the model expressivity. This work introduces two\nnew architectures whose Fourier degrees grow exponentially: the sequential and\nparallel exponential quantum machine learning architectures. This is done by\nefficiently using the available Hilbert space when encoding, increasing the\nexpressivity of the quantum encoding. Therefore, the exponential growth allows\nstaying at the low-qubit limit to create highly expressive circuits avoiding\nbarren plateaus. Practically, parallel exponential architecture was shown to\noutperform the existing linear architectures by reducing their final mean\nsquare error value by up to 44.7% in a one-dimensional test problem.\nFurthermore, the feasibility of this technique was also shown on a trapped ion\nquantum processing unit.\n","authors":["Mo Kordzanganeh","Pavel Sekatski","Leonid Fedichkin","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2212.00736v3.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2303.12247v2","updated":"2023-08-30T14:09:13Z","published":"2023-03-22T01:01:14Z","title":"Exploring the Benefits of Visual Prompting in Differential Privacy","summary":" Visual Prompting (VP) is an emerging and powerful technique that allows\nsample-efficient adaptation to downstream tasks by engineering a well-trained\nfrozen source model. In this work, we explore the benefits of VP in\nconstructing compelling neural network classifiers with differential privacy\n(DP). We explore and integrate VP into canonical DP training methods and\ndemonstrate its simplicity and efficiency. In particular, we discover that VP\nin tandem with PATE, a state-of-the-art DP training method that leverages the\nknowledge transfer from an ensemble of teachers, achieves the state-of-the-art\nprivacy-utility trade-off with minimum expenditure of privacy budget. Moreover,\nwe conduct additional experiments on cross-domain image classification with a\nsufficient domain gap to further unveil the advantage of VP in DP. Lastly, we\nalso conduct extensive ablation studies to validate the effectiveness and\ncontribution of VP under DP consideration. 
Our code is available at\n(https://github.com/EzzzLi/Prompt-PATE).\n","authors":["Yizhe Li","Yu-Lin Tsai","Xuebin Ren","Chia-Mu Yu","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12247v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.06208v3","updated":"2023-08-30T14:07:49Z","published":"2023-06-05T23:07:01Z","title":"DeltaNN: Assessing the Impact of Computational Environment Parameters on\n the Performance of Image Recognition Models","summary":" Image recognition tasks typically use deep learning and require enormous\nprocessing power, thus relying on hardware accelerators like GPUs and TPUs for\nfast, timely processing. Failure in real-time image recognition tasks can occur\ndue to sub-optimal mapping on hardware accelerators during model deployment,\nwhich may lead to timing uncertainty and erroneous behavior. Mapping on\nhardware accelerators is done using multiple software components like deep\nlearning frameworks, compilers, and device libraries, which we collectively\nrefer to as the computational environment. Owing to the increased use of image\nrecognition tasks in safety-critical applications like autonomous driving and\nmedical imaging, it is imperative to assess their robustness to changes in the\ncomputational environment, as the impact of parameters like deep learning\nframeworks, compiler optimizations, and hardware devices on model performance\nand correctness is not yet well understood.\n In this paper we present a differential testing framework, DeltaNN, that\nallows us to assess the impact of different computational environment\nparameters on the performance of image recognition models during deployment,\npost training. DeltaNN generates different implementations of a given image\nrecognition model for variations in environment parameters, namely, deep\nlearning frameworks, compiler optimizations, and hardware devices, and analyzes\nthe resulting differences in model performance. Using DeltaNN, we conduct an\nempirical study of robustness analysis of three popular image recognition\nmodels using the ImageNet dataset. We report the impact in terms of\nmisclassifications and inference time differences across different settings. In\ntotal, we observed up to 72% output label differences across deep learning\nframeworks, and up to 81% unexpected performance degradation in terms of\ninference time, when applying compiler optimizations.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06208v3.pdf","comment":"11 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2305.08854v2","updated":"2023-08-30T14:01:36Z","published":"2023-05-15T17:59:57Z","title":"Laughing Matters: Introducing Laughing-Face Generation using Diffusion\n Models","summary":" Speech-driven animation has gained significant traction in recent years, with\ncurrent methods achieving near-photorealistic results. However, the field\nremains underexplored regarding non-verbal communication despite evidence\ndemonstrating its importance in human interaction. In particular, generating\nlaughter sequences presents a unique challenge due to the intricacy and nuances\nof this behaviour. This paper aims to bridge this gap by proposing a novel\nmodel capable of generating realistic laughter sequences, given a still\nportrait and an audio clip containing laughter. We highlight the failure cases\nof traditional facial animation methods and leverage recent advances in\ndiffusion models to produce convincing laughter videos. 
We train our model on a\ndiverse set of laughter datasets and introduce an evaluation metric\nspecifically designed for laughter. When compared with previous speech-driven\napproaches, our model achieves state-of-the-art performance across all metrics,\neven when these are re-trained for laughter generation. Our code and project\nare publicly available.\n","authors":["Antoni Bigata Casademunt","Rodrigo Mira","Nikita Drobyshev","Konstantinos Vougioukas","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2305.08854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06157v3","updated":"2023-08-30T13:41:23Z","published":"2023-06-10T23:50:02Z","title":"Fault Localization for Buggy Deep Learning Framework Conversions in\n Image Recognition","summary":" When deploying Deep Neural Networks (DNNs), developers often convert models\nfrom one deep learning framework to another (e.g., TensorFlow to PyTorch).\nHowever, this process is error-prone and can impact target model accuracy. To\nidentify the extent of such impact, we perform and briefly present a\ndifferential analysis against three DNNs widely used for image recognition\n(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep\nlearning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which\nrevealed numerous model crashes and output label discrepancies of up to 72%. To\nmitigate such errors, we present a novel approach towards fault localization\nand repair of buggy deep learning framework conversions, focusing on\npre-trained image recognition models. Our technique consists of four stages of\nanalysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters,\nand 4) graph representation. In addition, we propose various repair strategies\nfor the detected faults. We implement our technique on top of the\nApache TVM deep learning compiler, and we test it by conducting a preliminary\nfault localization analysis for the conversion of InceptionV3 from TF to\nTFLite. Our approach detected a fault in a common DNN converter tool, which\nintroduced precision errors in weights, reducing model accuracy. After our\nfault localization, we repaired the issue, reducing our conversion error to\nzero.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06157v3.pdf","comment":"5 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.15016v2","updated":"2023-08-30T13:33:59Z","published":"2023-07-27T17:19:32Z","title":"How Good is Google Bard's Visual Understanding? An Empirical Study on\n Open Challenges","summary":" Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in\nthe field of conversational AI. Notably, Bard has recently been updated to\nhandle visual inputs alongside text prompts during conversations. Given Bard's\nimpressive track record in handling textual inputs, we explore its capabilities\nin understanding and interpreting visual data (images) conditioned by text\nquestions. This exploration holds the potential to unveil new insights and\nchallenges for Bard and other forthcoming multi-modal Generative models,\nespecially in addressing complex computer vision problems that demand accurate\nvisual and language understanding. Specifically, in this study, we focus on 15\ndiverse task scenarios encompassing regular, camouflaged, medical, under-water\nand remote sensing data to comprehensively evaluate Bard's performance. 
Our\nprimary finding indicates that Bard still struggles in these vision scenarios,\nhighlighting the significant gap in vision-based understanding that needs to be\nbridged in future developments. We expect that this empirical study will prove\nvaluable in advancing future models, leading to enhanced capabilities in\ncomprehending and interpreting fine-grained visual data. Our project is\nreleased at https://github.com/htqin/GoogleBard-VisUnderstand\n","authors":["Haotong Qin","Ge-Peng Ji","Salman Khan","Deng-Ping Fan","Fahad Shahbaz Khan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2307.15016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16022v1","updated":"2023-08-30T13:22:20Z","published":"2023-08-30T13:22:20Z","title":"PAVI: Plate-Amortized Variational Inference","summary":" Given observed data and a probabilistic generative model, Bayesian inference\nsearches for the distribution of the model's parameters that could have yielded\nthe data. Inference is challenging for large population studies where millions\nof measurements are performed over a cohort of hundreds of subjects, resulting\nin a massive parameter space. This large cardinality renders off-the-shelf\nVariational Inference (VI) computationally impractical.\n In this work, we design structured VI families that efficiently tackle large\npopulation studies. Our main idea is to share the parameterization and learning\nacross the different i.i.d. variables in a generative model, symbolized by the\nmodel's \\textit{plates}. We name this concept \\textit{plate amortization}.\nContrary to off-the-shelf stochastic VI, which slows down inference, plate\namortization yields variational distributions that are orders of magnitude\nfaster to train.\n Applied to large-scale hierarchical problems, PAVI yields expressive,\nparsimoniously parameterized VI with an affordable training time. This faster\nconvergence effectively unlocks inference in those large regimes. We illustrate\nthe practical utility of PAVI through a challenging Neuroimaging example\nfeaturing 400 million latent parameters, demonstrating a significant step\ntowards scalable and expressive Variational Inference.\n","authors":["Louis Rouillard","Alexandre Le Bris","Thomas Moreau","Demian Wassermann"],"pdf_url":"https://arxiv.org/pdf/2308.16022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00501v2","updated":"2023-08-30T13:02:41Z","published":"2023-07-02T07:20:47Z","title":"Classifying World War II Era Ciphers with Machine Learning","summary":" We determine the accuracy with which machine learning and deep learning\ntechniques can classify selected World War II era ciphers when only ciphertext\nis available. The specific ciphers considered are Enigma, M-209, Sigaba,\nPurple, and Typex. We experiment with three classic machine learning models,\nnamely, Support Vector Machines (SVM), $k$-Nearest Neighbors ($k$-NN), and\nRandom Forest (RF). We also experiment with four deep learning neural\nnetwork-based models: Multi-Layer Perceptrons (MLP), Long Short-Term Memory\n(LSTM), Extreme Learning Machines (ELM), and Convolutional Neural Networks\n(CNN). Each model is trained on features consisting of histograms, digrams, and\nraw ciphertext letter sequences. Furthermore, the classification problem is\nconsidered under four distinct scenarios: fixed plaintext with fixed keys,\nrandom plaintext with fixed keys, fixed plaintext with random keys, and random\nplaintext with random keys. 
Under the most realistic scenario, given 1000\ncharacters per ciphertext, we are able to distinguish the ciphers with greater\nthan 97% accuracy. In addition, we consider the accuracy of a subset of the\nlearning techniques as a function of the length of the ciphertext messages.\nSomewhat surprisingly, our classic machine learning models perform at least as\nwell as our deep learning models. We also find that ciphers that are more\nsimilar in design are somewhat more challenging to distinguish, but not as\ndifficult as might be expected.\n","authors":["Brooke Dalton","Mark Stamp"],"pdf_url":"https://arxiv.org/pdf/2307.00501v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16008v1","updated":"2023-08-30T12:55:02Z","published":"2023-08-30T12:55:02Z","title":"EnsembleFollower: A Hybrid Car-Following Framework Based On\n Reinforcement Learning and Hierarchical Planning","summary":" Car-following models have made significant contributions to our understanding\nof longitudinal driving behavior. However, they often exhibit limited accuracy\nand flexibility, as they cannot fully capture the complexity inherent in\ncar-following processes, or may falter in unseen scenarios due to their\nreliance on confined driving skills present in the training data. It is worth\nnoting that each car-following model possesses its own strengths and weaknesses\ndepending on specific driving scenarios. Therefore, we propose\nEnsembleFollower, a hierarchical planning framework for achieving advanced\nhuman-like car-following. The EnsembleFollower framework involves a high-level\nReinforcement Learning-based agent responsible for judiciously managing\nmultiple low-level car-following models according to the current state, either\nby selecting an appropriate low-level model to perform an action or by\nallocating different weights across all low-level components. Moreover, we\npropose a jerk-constrained kinematic model for more convincing car-following\nsimulations. We evaluate the proposed method based on real-world driving data\nfrom the HighD dataset. The experimental results illustrate that\nEnsembleFollower yields improved accuracy of human-like behavior and achieves\neffectiveness in combining hybrid models, demonstrating that our proposed\nframework can handle diverse car-following conditions by leveraging the\nstrengths of various low-level models.\n","authors":["Xu Han","Xianda Chen","Meixin Zhu","Pinlong Cai","Jianshan Zhou","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2308.16008v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.15987v1","updated":"2023-08-30T12:18:18Z","published":"2023-08-30T12:18:18Z","title":"FPTQ: Fine-grained Post-Training Quantization for Large Language Models","summary":" In the era of large-scale language models, the substantial parameter size\nposes significant challenges for deployment. Being a prevalent compression\ntechnique, quantization has emerged as the mainstream practice for tackling this\nissue, centered mainly on two recipes: W8A8 and W4A16 (i.e., weights and\nactivations in the respective bit widths). In this study, we propose a novel\nW4A8 post-training quantization method for the available open-sourced LLMs,\nwhich combines the advantages of both recipes. Therefore, we can leverage the\nbenefit in the I/O utilization of 4-bit weight quantization and the\nacceleration due to 8-bit matrix computation. Nevertheless, W4A8 quantization\nfaces notorious performance degradation. 
As a remedy, we introduce layerwise activation\nquantization strategies, which feature a novel logarithmic equalization for the\nmost intractable layers, and we combine them with fine-grained weight quantization.\nWithout bells and whistles, we eliminate the necessity for further fine-tuning\nand obtain the state-of-the-art W4A8 quantized performance on BLOOM, LLaMA, and\nLLaMA-2 on standard benchmarks. We confirm that the W4A8 quantization is\nachievable for the deployment of large language models, fostering their\nwidespread real-world adoption.\n","authors":["Qingyuan Li","Yifan Zhang","Liang Li","Peng Yao","Bo Zhang","Xiangxiang Chu","Yerui Sun","Li Du","Yuchen Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15984v1","updated":"2023-08-30T12:13:13Z","published":"2023-08-30T12:13:13Z","title":"Learning Structure-from-Motion with Graph Attention Networks","summary":" In this paper we tackle the problem of learning Structure-from-Motion (SfM)\nthrough the use of graph attention networks. SfM is a classic computer vision\nproblem that is solved through iterative minimization of reprojection errors,\nreferred to as Bundle Adjustment (BA), starting from a good initialization. In\norder to obtain a good enough initialization for BA, conventional methods rely\non a sequence of sub-problems (such as pairwise pose estimation, pose averaging\nor triangulation) which provides an initial solution that can then be refined\nusing BA. In this work we replace these sub-problems by learning a model that\ntakes as input the 2D keypoints detected across multiple views, and outputs the\ncorresponding camera poses and 3D keypoint coordinates. Our model takes\nadvantage of graph neural networks to learn SfM-specific primitives, and we\nshow that it can be used for fast inference of the reconstruction for new and\nunseen sequences. The experimental results show that the proposed model\noutperforms competing learning-based methods, and challenges COLMAP while\nhaving a lower runtime.\n","authors":["Lucas Brynte","José Pedro Iglesias","Carl Olsson","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2308.15984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06154v2","updated":"2023-08-30T12:07:02Z","published":"2023-06-09T14:49:20Z","title":"HypLL: The Hyperbolic Learning Library","summary":" Deep learning in hyperbolic space is quickly gaining traction in the fields\nof machine learning, multimedia, and computer vision. Deep networks commonly\noperate in Euclidean space, implicitly assuming that data lies on regular\ngrids. Recent advances have shown that hyperbolic geometry provides a viable\nalternative foundation for deep learning, especially when data is hierarchical\nin nature and when working with few embedding dimensions. Currently, however, no\naccessible open-source library exists to build hyperbolic network modules akin\nto well-known deep learning libraries. We present HypLL, the Hyperbolic\nLearning Library, to bring the progress on hyperbolic deep learning together.\nHypLL is built on top of PyTorch, with an emphasis on ease of use in its\ndesign, in order to attract a broad audience towards this new and\nopen-ended research direction. 
The code is available at:\nhttps://github.com/maxvanspengler/hyperbolic_learning_library.\n","authors":["Max van Spengler","Philipp Wirth","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2306.06154v2.pdf","comment":"ACM Multimedia Open-Source Software Competition 2023"},{"id":"http://arxiv.org/abs/2308.14521v2","updated":"2023-08-30T11:56:45Z","published":"2023-08-28T12:13:36Z","title":"Context-Aware Composition of Agent Policies by Markov Decision Process\n Entity Embeddings and Agent Ensembles","summary":" Computational agents support humans in many areas of life and are therefore\nfound in heterogeneous contexts. This means they operate in rapidly changing\nenvironments and can be confronted with huge state and action spaces. In order\nto perform services and carry out activities in a goal-oriented manner, agents\nrequire prior knowledge and therefore have to develop and pursue\ncontext-dependent policies. However, prescribing policies in advance is limited\nand inflexible, especially in dynamically changing environments. Moreover, the\ncontext of an agent determines its choice of actions. Since the environments\ncan be stochastic and complex in terms of the number of states and feasible\nactions, activities are usually modelled in a simplified way by Markov decision\nprocesses so that, e.g., agents with reinforcement learning are able to learn\npolicies that help to capture the context and act accordingly to optimally\nperform activities. However, training policies for all possible contexts using\nreinforcement learning is time-consuming. A requirement and challenge for\nagents is to learn strategies quickly and respond immediately in cross-context\nenvironments and applications, e.g., the Internet, service robotics,\ncyber-physical systems. In this work, we propose a novel simulation-based\napproach that enables a) the representation of heterogeneous contexts through\nknowledge graphs and entity embeddings and b) the context-aware composition of\npolicies on demand by ensembles of agents running in parallel. The evaluation\nwe conducted with the \"Virtual Home\" dataset indicates that agents that need\nto switch seamlessly between different contexts can request on-demand composed\npolicies that lead to the successful completion of context-appropriate\nactivities without having to learn these policies in lengthy training steps and\nepisodes, in contrast to agents that use reinforcement learning.\n","authors":["Nicole Merkle","Ralf Mikut"],"pdf_url":"https://arxiv.org/pdf/2308.14521v2.pdf","comment":"30 pages, 11 figures, 9 tables, 3 listings, Re-submitted to Semantic\n Web Journal, currently under review"},{"id":"http://arxiv.org/abs/2308.15973v1","updated":"2023-08-30T11:51:38Z","published":"2023-08-30T11:51:38Z","title":"Demo: A Digital Twin of the 5G Radio Access Network for Anomaly\n Detection Functionality","summary":" Recently, the concept of digital twins (DTs) has received significant\nattention within the realm of 5G/6G. This demonstration shows an innovative DT\ndesign and implementation framework tailored toward integration within the 5G\ninfrastructure. The proposed DT enables near real-time anomaly detection\ncapability pertaining to user connectivity. It empowers the 5G system to\nproactively execute decisions for resource control and connection restoration.\n","authors":["Peizheng Li","Adnan Aijaz","Tim Farnham","Sajida Gufran","Sita Chintalapati"],"pdf_url":"https://arxiv.org/pdf/2308.15973v1.pdf","comment":"2 pages, 2 figures. 
This paper has been accepted by the 31st IEEE\n International Conference on Network Protocols (ICNP 2023)"},{"id":"http://arxiv.org/abs/2308.02562v2","updated":"2023-08-30T11:47:05Z","published":"2023-08-03T04:03:46Z","title":"Food Classification using Joint Representation of Visual and Textual\n Data","summary":" Food classification is an important task in health care. In this work, we\npropose a multimodal classification framework that uses a modified version of\nEfficientNet with the Mish activation function for image classification, and a\ntraditional BERT transformer-based network for text classification.\nThe proposed network and the other state-of-the-art methods are evaluated on a\nlarge open-source dataset, UPMC Food-101. The experimental results show that\nthe proposed network outperforms the other methods; a significant difference of\n11.57% and 6.34% in accuracy is observed for image and text classification,\nrespectively, when compared with the second-best performing method. We also\ncompared the performance in terms of accuracy, precision, and recall for text\nclassification using both machine learning and deep learning-based models. The\ncomparative analysis from the prediction results of both images and text\ndemonstrated the efficiency and robustness of the proposed approach.\n","authors":["Prateek Mittal","Puneet Goyal","Joohi Chauhan"],"pdf_url":"https://arxiv.org/pdf/2308.02562v2.pdf","comment":"Updated results and discussions to be posted and some sections needed\n to be expanded"},{"id":"http://arxiv.org/abs/2204.07000v2","updated":"2023-08-30T11:05:50Z","published":"2022-04-14T14:49:34Z","title":"Solving AC Power Flow with Graph Neural Networks under Realistic\n Constraints","summary":" In this paper, we propose a graph neural network architecture to solve the AC\npower flow problem under realistic constraints. To ensure a safe and resilient\noperation of distribution grids, AC power flow calculations are the means of\nchoice to determine grid operating limits or analyze grid asset utilization in\nplanning procedures. In our approach, we demonstrate the development of a\nframework that uses graph neural networks to learn the physical constraints of\nthe power flow. We present our model architecture on which we perform\nunsupervised training to learn a general solution of the AC power flow\nformulation independent of the specific topologies and supply tasks used for\ntraining. Finally, we demonstrate, validate and discuss our results on medium\nvoltage benchmark grids. In our approach, we focus on the physical and\ntopological properties of distribution grids to provide scalable solutions for\nreal grid topologies. Therefore, we take a data-driven approach, using large\nand diverse data sets consisting of realistic grid topologies, for the\nunsupervised training of the AC power flow graph neural network architecture\nand compare the results to a prior neural architecture and the Newton-Raphson\nmethod. Our approach shows a large improvement in computation time and good\naccuracy compared to state-of-the-art solvers. 
It also outperforms the prior neural\nsolver for power flow in terms of accuracy.\n","authors":["Luis Böttcher","Hinrikus Wolf","Bastian Jung","Philipp Lutat","Marc Trageser","Oliver Pohl","Andreas Ulbig","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2204.07000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15428v2","updated":"2023-08-30T10:38:41Z","published":"2023-07-28T09:26:00Z","title":"Implicit neural representation for change detection","summary":" Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained\nduring two distinct time periods over the same geographic region, presents a\nsignificant challenge due to the disparities in spatial coverage and the\npresence of noise in the acquisition system. The most commonly used approaches\nto detecting changes in point clouds are based on supervised methods, which\nnecessitate extensive labelled data that is often unavailable in real-world\napplications. To address these issues, we propose an unsupervised approach that\ncomprises two components: Implicit Neural Representation (INR) for continuous\nshape reconstruction and a Gaussian Mixture Model for categorising changes. INR\noffers a grid-agnostic representation for encoding bi-temporal point clouds,\nwith unmatched spatial support that can be regularised to enhance\nhigh-frequency details and reduce noise. The reconstructions at each timestamp\nare compared at arbitrary spatial scales, leading to a significant increase in\ndetection capabilities. We apply our method to a benchmark dataset comprising\nsimulated LiDAR point clouds for urban sprawl. This dataset encompasses\ndiverse challenging scenarios, varying in resolutions, input modalities and\nnoise levels. This enables a comprehensive multi-scenario evaluation, comparing\nour method with the current state-of-the-art approach. We outperform the\nprevious methods by a margin of 10% in the intersection over union metric. In\naddition, we put our techniques to practical use by applying them in a\nreal-world scenario to identify instances of illicit excavation of\narchaeological sites and validate our results by comparing them with findings\nfrom field experts.\n","authors":["Peter Naylor","Diego Di Carlo","Arianna Traviglia","Makoto Yamada","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.15428v2.pdf","comment":"Main article is 10 pages + 6 pages of supplementary. Conference style\n paper"},{"id":"http://arxiv.org/abs/2308.15936v1","updated":"2023-08-30T10:33:02Z","published":"2023-08-30T10:33:02Z","title":"Jaccard-constrained dense subgraph discovery","summary":" Finding dense subgraphs is a core problem in graph mining with many\napplications in diverse domains. At the same time, many real-world networks vary\nover time; that is, the dataset can be represented as a sequence of graph\nsnapshots. Hence, it is natural to consider the question of finding dense\nsubgraphs in a temporal network that are allowed to vary over time to a certain\ndegree. In this paper, we search for dense subgraphs that have large pairwise\nJaccard similarity coefficients. More formally, given a set of graph snapshots\nand a weight $\\lambda$, we find a collection of dense subgraphs such that the\nsum of densities of the induced subgraphs plus the sum of Jaccard indices,\nweighted by $\\lambda$, is maximized. We prove that this problem is NP-hard. 
To\ndiscover dense subgraphs with good objective value, we present an iterative\nalgorithm which runs in $\\mathcal{O}(n^2k^2 + m \\log n + k^3 n)$ time per\nsingle iteration, and a greedy algorithm which runs in $\\mathcal{O}(n^2k^2 + m\n\\log n + k^3 n)$ time, where $k$ is the length of the graph sequence and $n$\nand $m$ denote the number of nodes and the total number of edges, respectively.\nWe show experimentally that our algorithms are efficient, that they can find\nthe ground truth in synthetic datasets, and that they provide interpretable\nresults from real-world datasets. Finally, we present a case study that shows\nthe usefulness of our problem.\n","authors":["Chamalee Wickrama Arachchi","Nikolaj Tatti"],"pdf_url":"https://arxiv.org/pdf/2308.15936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13455v3","updated":"2023-08-30T10:27:00Z","published":"2022-10-21T09:59:15Z","title":"E-MCTS: Deep Exploration in Model-Based Reinforcement Learning by\n Planning with Epistemic Uncertainty","summary":" One of the most well-studied and highly performing planning approaches used\nin Model-Based Reinforcement Learning (MBRL) is Monte-Carlo Tree Search (MCTS).\nKey challenges of MCTS-based MBRL methods remain dedicated deep exploration and\nreliability in the face of the unknown, and both challenges can be alleviated\nthrough principled epistemic uncertainty estimation in the predictions of MCTS.\nWe present two main contributions: First, we develop a methodology to propagate\nepistemic uncertainty in MCTS, enabling agents to estimate the epistemic\nuncertainty in their predictions. Second, we utilize the propagated uncertainty\nfor a novel deep exploration algorithm by explicitly planning to explore. We\nincorporate our approach into variations of MCTS-based MBRL approaches with\nlearned and provided dynamics models, and empirically show deep exploration\nthrough successful epistemic uncertainty estimation achieved by our approach.\nWe compare to a non-planning-based deep-exploration baseline, and demonstrate\nthat planning with epistemic MCTS significantly outperforms non-planning-based\nexploration in the investigated deep exploration benchmark.\n","authors":["Yaniv Oren","Matthijs T. J. Spaan","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2210.13455v3.pdf","comment":"Submitted to NeurIPS 2023, accepted to EWRL 2023"},{"id":"http://arxiv.org/abs/2307.09829v2","updated":"2023-08-30T10:19:02Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little\naddresses classification. This study empirically investigates the latter and\nexpands the understanding of frequency shortcuts. First, we perform experiments\non synthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low or high frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. 
Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. We recommend that future research focus\non effective training schemes that mitigate frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v2.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2210.17287v3","updated":"2023-08-30T10:18:25Z","published":"2022-10-27T10:46:32Z","title":"Diffiner: A Versatile Diffusion-based Generative Refiner for Speech\n Enhancement","summary":" Although deep neural network (DNN)-based speech enhancement (SE) methods\noutperform the previous non-DNN-based ones, they often degrade the perceptual\nquality of generated outputs. To tackle this problem, we introduce a DNN-based\ngenerative refiner, Diffiner, aiming to improve the perceptual quality of\nspeech pre-processed by an SE method. We train a diffusion-based generative\nmodel by utilizing a dataset consisting of clean speech only. Then, our refiner\neffectively mixes clean parts newly generated via denoising diffusion\nrestoration into the degraded and distorted parts caused by a preceding SE\nmethod, resulting in refined speech. Once our refiner is trained on a set of\nclean speech, it can be applied to various SE methods without additional\ntraining specialized for each SE module. Therefore, our refiner can be a\nversatile post-processing module w.r.t. SE methods and has high potential in\nterms of modularity. Experimental results show that our method improved\nperceptual speech quality regardless of the preceding SE methods used.\n","authors":["Ryosuke Sawata","Naoki Murata","Yuhta Takida","Toshimitsu Uesaka","Takashi Shibuya","Shusuke Takahashi","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2210.17287v3.pdf","comment":"Accepted by Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.15930v1","updated":"2023-08-30T10:12:39Z","published":"2023-08-30T10:12:39Z","title":"LLaSM: Large Language and Speech Model","summary":" Multi-modal large language models have garnered significant interest\nrecently. Yet most existing work focuses on vision-language multi-modal models\nthat provide strong capabilities in following vision-and-language instructions.\nHowever, we claim that speech is also an important modality through which\nhumans interact with the world. Hence, it is crucial for a general-purpose\nassistant to be able to follow multi-modal speech-and-language instructions. In\nthis work, we propose the Large Language and Speech Model (LLaSM). LLaSM is an\nend-to-end trained large multi-modal speech-language model with cross-modal\nconversational abilities, capable of following speech-and-language\ninstructions. Our early experiments show that LLaSM demonstrates a more\nconvenient and natural way for humans to interact with artificial intelligence.\nWe also release LLaSM-Audio-Instructions, a large speech-instruction-following\ndataset. Code and demo are available at\nhttps://github.com/LinkSoul-AI/LLaSM and\nhttps://huggingface.co/spaces/LinkSoul/LLaSM. 
The LLaSM-Audio-Instructions\ndataset is available at\nhttps://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.\n","authors":["Yu Shu","Siwei Dong","Guangyao Chen","Wenhao Huang","Ruihua Zhang","Daochen Shi","Qiqi Xiang","Yemin Shi"],"pdf_url":"https://arxiv.org/pdf/2308.15930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11141v2","updated":"2023-08-30T09:57:57Z","published":"2022-07-22T15:25:59Z","title":"Deep neural networks on diffeomorphism groups for optimal shape\n reparameterization","summary":" One of the fundamental problems in shape analysis is to align curves or\nsurfaces before computing geodesic distances between their shapes. Finding the\noptimal reparametrization realizing this alignment is a computationally\ndemanding task, typically done by solving an optimization problem on the\ndiffeomorphism group. In this paper, we propose an algorithm for constructing\napproximations of orientation-preserving diffeomorphisms by composition of\nelementary diffeomorphisms. The algorithm is implemented using PyTorch, and is\napplicable to both unparametrized curves and surfaces. Moreover, we show\nuniversal approximation properties for the constructed architectures, and\nobtain bounds for the Lipschitz constants of the resulting diffeomorphisms.\n","authors":["Elena Celledoni","Helge Glöckner","Jørgen Riseth","Alexander Schmeding"],"pdf_url":"https://arxiv.org/pdf/2207.11141v2.pdf","comment":"36 pages, 11 figures. Accepted by BIT Numerical Mathematics, not yet\n published"},{"id":"http://arxiv.org/abs/2308.15911v1","updated":"2023-08-30T09:38:44Z","published":"2023-08-30T09:38:44Z","title":"Cyclophobic Reinforcement Learning","summary":" In environments with sparse rewards, finding a good inductive bias for\nexploration is crucial to the agent's success. However, there are two competing\ngoals: novelty search and systematic exploration. While existing approaches\nsuch as curiosity-driven exploration find novelty, they sometimes do not\nsystematically explore the whole state space, akin to depth-first search vs.\nbreadth-first search. In this paper, we propose a new intrinsic reward that is\ncyclophobic, i.e., it does not reward novelty, but punishes redundancy by\navoiding cycles. Augmenting the cyclophobic intrinsic reward with a sequence of\nhierarchical representations based on the agent's cropped observations, we are\nable to achieve excellent results in the MiniGrid and MiniHack environments.\nBoth are particularly hard, as they require complex interactions with different\nobjects in order to be solved. Detailed comparisons with previous approaches\nand thorough ablation studies show that our newly proposed cyclophobic\nreinforcement learning is more sample-efficient than other state-of-the-art\nmethods in a variety of tasks.\n","authors":["Stefan Sylvius Wagner","Peter Arndt","Jan Robine","Stefan Harmeling"],"pdf_url":"https://arxiv.org/pdf/2308.15911v1.pdf","comment":"Published in Transactions on Machine Learning Research (08/2023)"},{"id":"http://arxiv.org/abs/2210.10264v3","updated":"2023-08-30T09:19:01Z","published":"2022-10-19T02:59:31Z","title":"SignReLU neural network and its approximation ability","summary":" Deep neural networks (DNNs) have garnered significant attention in various\nfields of science and technology in recent years. Activation functions define\nhow neurons in DNNs process incoming signals. They are essential for\nlearning non-linear transformations and for performing diverse computations\namong successive neuron layers. 
In the last few years, researchers have\ninvestigated the approximation ability of DNNs to explain their power and\nsuccess. In this paper, we explore the approximation ability of DNNs using a\ndifferent activation function, called SignReLU. Our theoretical results\ndemonstrate that SignReLU networks outperform rational and ReLU networks in\nterms of approximation performance. Numerical experiments are conducted\ncomparing SignReLU with existing activations such as ReLU, Leaky ReLU, and\nELU, illustrating the competitive practical performance of SignReLU.\n","authors":["Jianfei Li","Han Feng","Ding-Xuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2210.10264v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15905v1","updated":"2023-08-30T09:15:41Z","published":"2023-08-30T09:15:41Z","title":"Thermodynamic Computing via Autonomous Quantum Thermal Machines","summary":" We develop a physics-based model for classical computation based on\nautonomous quantum thermal machines. These machines consist of a few interacting\nquantum bits (qubits) connected to several environments at different\ntemperatures. Heat flows through the machine are here exploited for computing.\nThe process starts by setting the temperatures of the environments according to\nthe logical input. The machine evolves, eventually reaching a non-equilibrium\nsteady state, from which the output of the computation can be determined via\nthe temperature of an auxiliary finite-size reservoir. Such a machine, which\nwe term a \"thermodynamic neuron\", can implement any linearly-separable\nfunction, and we discuss explicitly the cases of NOT, 3-majority and NOR gates.\nIn turn, we show that a network of thermodynamic neurons can perform any\ndesired function. We discuss the close connection between our model and\nartificial neurons (perceptrons), and argue that our model provides an\nalternative physics-based analogue implementation of neural networks, and more\ngenerally a platform for thermodynamic computing.\n","authors":["Patryk Lipka-Bartosik","Martí Perarnau-Llobet","Nicolas Brunner"],"pdf_url":"https://arxiv.org/pdf/2308.15905v1.pdf","comment":"12 + 4 pages. Comments welcome!"},{"id":"http://arxiv.org/abs/2308.15899v1","updated":"2023-08-30T09:09:42Z","published":"2023-08-30T09:09:42Z","title":"Beyond Traditional Neural Networks: Toward adding Reasoning and Learning\n Capabilities through Computational Logic Techniques","summary":" Deep Learning (DL) models have become popular for solving complex problems,\nbut they have limitations such as the need for high-quality training data, lack\nof transparency, and robustness issues. Neuro-Symbolic AI has emerged as a\npromising approach combining the strengths of neural networks and symbolic\nreasoning. Symbolic knowledge injection (SKI) techniques are a popular method\nto incorporate symbolic knowledge into sub-symbolic systems. This work proposes\nsolutions to improve the knowledge injection process and integrate elements of\nML and logic into multi-agent systems (MAS).\n","authors":["Andrea Rafanelli"],"pdf_url":"https://arxiv.org/pdf/2308.15899v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15887v1","updated":"2023-08-30T09:04:24Z","published":"2023-08-30T09:04:24Z","title":"On the Potential of CLIP for Compositional Logical Reasoning","summary":" In this paper we explore the possibility of using OpenAI's CLIP to perform\nlogically coherent grounded visual reasoning. 
To that end, we formalize our\nterms and give a geometric analysis of how embeddings in CLIP's latent space\nwould need to be configured in order for the system to be logically coherent.\nOur main conclusion is that, as usually configured, CLIP cannot perform such\nreasoning.\n","authors":["Justin Brody"],"pdf_url":"https://arxiv.org/pdf/2308.15887v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15885v1","updated":"2023-08-30T09:04:06Z","published":"2023-08-30T09:04:06Z","title":"Towards One-Shot Learning for Text Classification using Inductive Logic\n Programming","summary":" With the ever-increasing potential of AI to perform personalised tasks, it is\nbecoming essential to develop new machine learning techniques which are\ndata-efficient and do not require hundreds or thousands of training examples. In\nthis paper, we explore an Inductive Logic Programming approach for one-shot\ntext classification. In particular, we explore the framework of\nMeta-Interpretive Learning (MIL), along with using common-sense background\nknowledge extracted from ConceptNet. Results indicate that MIL can learn text\nclassification rules from a small number of training examples. Moreover, the\nhigher the complexity of the chosen examples, the higher the accuracy of the\noutcome.\n","authors":["Ghazal Afroozi Milani","Daniel Cyrus","Alireza Tamaddoni-Nezhad"],"pdf_url":"https://arxiv.org/pdf/2308.15885v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15883v1","updated":"2023-08-30T09:03:45Z","published":"2023-08-30T09:03:45Z","title":"\"Would life be more interesting if I were in AI?\" Answering\n Counterfactuals based on Probabilistic Inductive Logic Programming","summary":" Probabilistic logic programs are logic programs where some facts hold with a\nspecified probability. Here, we investigate these programs with a causal\nframework that allows counterfactual queries. Learning the program structure\nfrom observational data is usually done through heuristic search relying on\nstatistical tests. However, these statistical tests lack information about the\ncausal mechanism generating the data, which makes it infeasible to use the\nresulting programs for counterfactual reasoning. To address this, we propose a\nlanguage fragment that allows reconstructing a program from its induced\ndistribution. This further enables us to learn programs supporting\ncounterfactual queries.\n","authors":["Kilian Rückschloß","Felix Weitkämper"],"pdf_url":"https://arxiv.org/pdf/2308.15883v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15873v1","updated":"2023-08-30T08:58:23Z","published":"2023-08-30T08:58:23Z","title":"Minimum Width for Deep, Narrow MLP: A Diffeomorphism and the Whitney\n Embedding Theorem Approach","summary":" Recently, there has been significant attention on determining the minimum\nwidth for the universal approximation property of deep, narrow MLPs. Among\nthese problems, approximating a continuous function under the uniform norm is\nimportant and challenging, with the gap between the known lower and upper\nbounds being hard to narrow. In this regard, we propose a novel upper bound for\nthe minimum width, given by $\\operatorname{max}(2d_x+1, d_y) + \\alpha(\\sigma)$,\nto achieve uniform approximation in deep narrow MLPs, where $0\\leq \\alpha(\\sigma)\\leq 2$\nis a constant depending on the activation function. We demonstrate\nthis bound through two key proofs. 
First, we establish that deep, narrow MLPs\nwith little additional width can approximate diffeomorphisms. Second, we\nutilize the Whitney embedding theorem to show that any continuous function can\nbe approximated by embeddings, further decomposed into linear transformations\nand diffeomorphisms.\n","authors":["Geonho Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15856v1","updated":"2023-08-30T08:46:46Z","published":"2023-08-30T08:46:46Z","title":"Domain Generalization without Excess Empirical Risk","summary":" Given data from diverse sets of distinct distributions, domain generalization\naims to learn models that generalize to unseen distributions. A common approach\nis designing a data-driven surrogate penalty to capture generalization and\nminimize the empirical risk jointly with the penalty. We argue that a\nsignificant failure mode of this recipe is an excess risk due to an erroneous\npenalty or hardness in joint optimization. We present an approach that\neliminates this problem. Instead of jointly minimizing empirical risk with the\npenalty, we minimize the penalty under the constraint of optimality of the\nempirical risk. This change guarantees that the domain generalization penalty\ncannot impair optimization of the empirical risk, i.e., in-distribution\nperformance. To solve the proposed optimization problem, we demonstrate an\nexciting connection to rate-distortion theory and utilize its tools to design\nan efficient method. Our approach can be applied to any penalty-based domain\ngeneralization method, and we demonstrate its effectiveness by applying it to\nthree exemplar methods from the literature, showing significant improvements.\n","authors":["Ozan Sener","Vladlen Koltun"],"pdf_url":"https://arxiv.org/pdf/2308.15856v1.pdf","comment":"Published at NeurIPS 2022"},{"id":"http://arxiv.org/abs/2302.08811v2","updated":"2023-08-30T08:23:19Z","published":"2023-02-17T11:09:59Z","title":"G-Signatures: Global Graph Propagation With Randomized Signatures","summary":" Graph neural networks (GNNs) have evolved into one of the most popular deep\nlearning architectures. However, GNNs suffer from over-smoothing node\ninformation and, therefore, struggle to solve tasks where global graph\nproperties are relevant. We introduce G-Signatures, a novel graph learning\nmethod that enables global graph propagation via randomized signatures.\nG-Signatures use a new graph conversion concept to embed graph-structured\ninformation, which can be interpreted as paths in latent space. We further\nintroduce the idea of latent space path mapping. This allows us to iteratively\ntraverse latent space paths, and thus globally process information.\nG-Signatures excel at extracting and processing global graph properties, and\neffectively scale to large graph problems. Empirically, we confirm the\nadvantages of G-Signatures on several classification and regression tasks.\n","authors":["Bernhard Schäfl","Lukas Gruber","Johannes Brandstetter","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2302.08811v2.pdf","comment":"7 pages (+ appendix); 4 figures"},{"id":"http://arxiv.org/abs/2308.15840v1","updated":"2023-08-30T08:21:56Z","published":"2023-08-30T08:21:56Z","title":"MSGNN: Multi-scale Spatio-temporal Graph Neural Network for Epidemic\n Forecasting","summary":" Infectious disease forecasting has been a key focus and has proved crucial\nin controlling epidemics. 
A recent trend is to develop forecasting models based\non graph neural networks (GNNs). However, existing GNN-based methods suffer\nfrom two key limitations: (1) Current models broaden receptive fields by\nscaling the depth of GNNs, which is insufficient to preserve the semantics of\nlong-range connectivity between distant but epidemic-related areas. (2)\nPrevious approaches model epidemics within a single spatial scale, while\nignoring the multi-scale epidemic patterns derived from different scales. To\naddress these deficiencies, we devise the Multi-scale Spatio-temporal Graph Neural\nNetwork (MSGNN) based on an innovative multi-scale view. To be specific, in the\nproposed MSGNN model, we first devise a novel graph learning module, which\ndirectly captures long-range connectivity from trans-regional epidemic signals\nand integrates them into a multi-scale graph. Based on the learned multi-scale\ngraph, we utilize a newly designed graph convolution module to exploit\nmulti-scale epidemic patterns. This module allows us to facilitate multi-scale\nepidemic modeling by mining both scale-shared and scale-specific patterns.\nExperimental results on forecasting new cases of COVID-19 in the United States\ndemonstrate the superiority of our method over state-of-the-art methods. Further\nanalyses and visualization also show that MSGNN offers not only accurate, but also\nrobust and interpretable forecasting results.\n","authors":["Mingjie Qiu","Zhiyi Tan","Bing-kun Bao"],"pdf_url":"https://arxiv.org/pdf/2308.15840v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.15838v1","updated":"2023-08-30T08:21:46Z","published":"2023-08-30T08:21:46Z","title":"Adaptive Lasso, Transfer Lasso, and Beyond: An Asymptotic Perspective","summary":" This paper presents a comprehensive exploration of the theoretical properties\ninherent in the Adaptive Lasso and the Transfer Lasso. The Adaptive Lasso, a\nwell-established method, employs regularization divided by initial estimators\nand is characterized by asymptotic normality and variable selection\nconsistency. In contrast, the recently proposed Transfer Lasso employs\nregularization subtracted by initial estimators with the demonstrated capacity\nto curtail non-asymptotic estimation errors. A pivotal question thus emerges:\nGiven the distinct ways the Adaptive Lasso and the Transfer Lasso employ\ninitial estimators, what benefits or drawbacks does this disparity confer upon\neach method? This paper conducts a theoretical examination of the asymptotic\nproperties of the Transfer Lasso, thereby elucidating its differentiation from\nthe Adaptive Lasso. Informed by the findings of this analysis, we introduce a\nnovel method, one that amalgamates the strengths and compensates for the\nweaknesses of both methods. The paper concludes with validations of our theory\nand comparisons of the methods via simulation experiments.\n","authors":["Masaaki Takada","Hironori Fujisawa"],"pdf_url":"https://arxiv.org/pdf/2308.15838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11479v2","updated":"2023-08-30T08:21:40Z","published":"2023-01-27T00:51:48Z","title":"Alien Coding","summary":" We introduce a self-learning algorithm for synthesizing programs for OEIS\nsequences. The algorithm starts from scratch, initially generating programs at\nrandom.
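The contrast between the two Lasso variants above (regularization divided by vs. subtracted by the initial estimators) can be made concrete. A minimal sketch, assuming standard notation; the function names, the exponent gamma, and the epsilon guard are our own, not the papers' exact formulations:

```python
import numpy as np

def adaptive_lasso_penalty(beta, beta_init, lam=1.0, gamma=1.0, eps=1e-8):
    # Regularization *divided by* initial estimators: coefficients whose initial
    # estimates are small are penalized more, encouraging selection consistency.
    return lam * np.sum(np.abs(beta) / (np.abs(beta_init) + eps) ** gamma)

def transfer_lasso_penalty(beta, beta_init, lam=1.0):
    # Regularization *subtracted by* initial estimators: shrink toward the
    # initial estimate rather than toward zero, transferring its information.
    return lam * np.sum(np.abs(beta - beta_init))
```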
Then it runs many iterations of a self-learning loop that interleaves\n(i) training neural machine translation to learn the correspondence between\nsequences and the programs discovered so far, and (ii) proposing many new\nprograms for each OEIS sequence by the trained neural machine translator. The\nalgorithm discovers, on its own, programs for more than 78000 OEIS sequences,\nsometimes developing unusual programming methods. We analyze its behavior and\nthe invented programs in several experiments.\n","authors":["Thibault Gauthier","Miroslav Olšák","Josef Urban"],"pdf_url":"https://arxiv.org/pdf/2301.11479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v2","updated":"2023-08-30T08:20:30Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then identify the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v2.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2204.09398v2","updated":"2023-08-30T08:18:15Z","published":"2022-04-20T11:43:58Z","title":"Case-Aware Adversarial Training","summary":" Neural networks (NNs) have become one of the most popular model types in\nvarious signal processing applications. However, NNs are extremely vulnerable\nto adversarial examples (AEs). To defend against AEs, adversarial training (AT)\nis believed to be the most effective method, but its intensive computation\nlimits its applicability in most settings. In this paper, to resolve the\nproblem, we design a generic and efficient AT improvement scheme, namely\ncase-aware adversarial training (CAT). Specifically, the intuition stems from\nthe fact that a very limited set of informative samples contributes most of the\nmodel performance. Accordingly, if only the most informative AEs are used in AT,\nwe can lower the computational complexity of AT significantly while maintaining\nthe defense effect. To achieve this, CAT makes two breakthroughs. First, a\nmethod to estimate the information degree of adversarial examples is proposed\nfor AE filtering. Second, to further enrich the information that the NN can\nobtain from AEs, CAT involves a weight estimation and class-level balancing\nbased sampling strategy to increase the\ndiversity of AT at each iteration.
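A toy sketch of the case-aware filtering idea just described: keep only the most "informative" adversarial examples, with class-level balancing. Using the per-example loss as the information score is our assumption for illustration; the paper proposes its own estimation method:

```python
import numpy as np

def select_informative_aes(losses: np.ndarray, labels: np.ndarray,
                           keep_frac: float = 0.3) -> np.ndarray:
    """Return indices of high-information AEs, balanced per class (a sketch)."""
    kept = []
    for c in np.unique(labels):                      # class-level balancing
        idx = np.where(labels == c)[0]
        k = max(1, int(keep_frac * len(idx)))
        top = idx[np.argsort(losses[idx])[-k:]]      # highest-loss AEs per class
        kept.extend(top.tolist())
    return np.asarray(kept)
```

Training would then run AT only on the returned subset each iteration, which is where the claimed speedup over vanilla AT comes from.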
Extensive experiments show that CAT is\nfaster than vanilla AT by up to 3x while achieving a competitive defense effect.\n","authors":["Mingyuan Fan","Yang Liu","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2204.09398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14624v2","updated":"2023-08-30T08:01:22Z","published":"2022-09-29T08:38:30Z","title":"Is Complexity Required for Neural Network Pruning? A Case Study on\n Global Magnitude Pruning","summary":" Pruning neural networks has become popular in the last decade when it was\nshown that a large number of weights can be safely removed from modern neural\nnetworks without compromising accuracy. Numerous pruning methods have been\nproposed since then, each claiming to be better than the previous. Many\nstate-of-the-art (SOTA) techniques today rely on complex pruning methodologies\nutilizing importance scores, getting feedback through back-propagation or\nhaving heuristics-based pruning rules amongst others. In this work, we question\nwhether this pattern of introducing complexity is really necessary to achieve\nbetter pruning results. We benchmark these SOTA techniques against a naive\npruning baseline, namely, Global Magnitude Pruning (Global MP). Global MP ranks\nweights in order of their magnitudes and prunes the smallest ones. Hence, in\nits vanilla form, it is one of the simplest pruning techniques. Surprisingly,\nwe find that vanilla Global MP outperforms all the other SOTA techniques and\nachieves a new SOTA result. It also achieves promising performance on FLOPs\nsparsification, which we find is enhanced when pruning is conducted in a\ngradual fashion. We also find that Global MP is generalizable across tasks,\ndatasets, and models with superior performance. Moreover, a common issue that\nmany pruning algorithms run into at high sparsity rates, namely,\nlayer-collapse, can be easily fixed in Global MP by setting a minimum threshold\nof weights to be retained in each layer. Lastly, unlike many other SOTA\ntechniques, Global MP does not require any additional algorithm-specific\nhyper-parameters and is very straightforward to tune and implement. We showcase\nour findings on various models (WRN-28-8, ResNet-32, ResNet-50, MobileNet-V1\nand FastGRNN) and multiple datasets (CIFAR-10, ImageNet and HAR-2). Code is\navailable at https://github.com/manasgupta-1/GlobalMP.\n","authors":["Manas Gupta","Efe Camci","Vishandi Rudy Keneta","Abhishek Vaidyanathan","Ritwik Kanodia","Chuan-Sheng Foo","Wu Min","Lin Jie"],"pdf_url":"https://arxiv.org/pdf/2209.14624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15821v1","updated":"2023-08-30T07:46:32Z","published":"2023-08-30T07:46:32Z","title":"Federated Two Stage Decoupling With Adaptive Personalization Layers","summary":" Federated learning has gained significant attention due to its groundbreaking\nability to enable distributed learning while maintaining privacy constraints.\nHowever, as a consequence of data heterogeneity among decentralized devices, it\ninherently experiences significant learning degradation and slow convergence\nspeed. Therefore, it is natural to employ the concept of clustering homogeneous\nclients into the same group, allowing only the model weights within each group\nto be aggregated. Although most existing clustered federated learning methods\nemploy either model gradients or inference outputs as metrics for client\npartitioning, with the goal of grouping similar devices together, heterogeneity\nmay still exist within each cluster.
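Global Magnitude Pruning, as described in the abstract above, fits in a few lines. A minimal PyTorch sketch (one-shot version; the paper also studies gradual pruning and a per-layer minimum-retention threshold against layer collapse, which are omitted here):

```python
import torch

def global_magnitude_prune(model: torch.nn.Module, sparsity: float = 0.9) -> None:
    """Zero out the globally smallest-magnitude weights across all layers."""
    # Pool all multi-dimensional (weight) tensors into one magnitude vector.
    all_weights = torch.cat([p.detach().abs().flatten()
                             for p in model.parameters() if p.dim() > 1])
    k = max(1, int(sparsity * all_weights.numel()))
    threshold = all_weights.kthvalue(k).values      # global magnitude cutoff
    with torch.no_grad():
        for p in model.parameters():
            if p.dim() > 1:
                p.mul_((p.abs() > threshold).float())   # prune below threshold
```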
Moreover, there is a scarcity of\nresearch exploring the underlying reasons for determining the appropriate\ntiming for clustering, resulting in the common practice of assigning each\nclient to its own individual cluster, particularly in the context of highly\nnon-independent and identically distributed (non-IID) data. In this paper, we\nintroduce a two-stage decoupling federated learning algorithm with adaptive\npersonalization layers named FedTSDP, where client clustering is performed\ntwice according to inference outputs and model weights, respectively. Hopkins\namended sampling is adopted to determine the appropriate timing for clustering\nand the sampling weight of public unlabeled data. In addition, a simple yet\neffective approach is developed to adaptively adjust the personalization layers\nbased on varying degrees of data skew. Experimental results show that our\nproposed method has reliable performance in both IID and non-IID scenarios.\n","authors":["Hangyu Zhu","Yuxiang Fan","Zhenping Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18415v2","updated":"2023-08-30T07:39:14Z","published":"2023-05-28T18:48:50Z","title":"Geometric Algebra Transformers","summary":" Problems involving geometric data arise in physics, chemistry, robotics,\ncomputer vision, and many other fields. Such data can take numerous forms, such\nas points, direction vectors, translations, or rotations, but to date there is\nno single architecture that can be applied to such a wide variety of geometric\ntypes while respecting their symmetries. In this paper we introduce the\nGeometric Algebra Transformer (GATr), a general-purpose architecture for\ngeometric data. GATr represents inputs, outputs, and hidden states in the\nprojective geometric (or Clifford) algebra, which offers an efficient\n16-dimensional vector-space representation of common geometric objects as well\nas operators acting on them. GATr is equivariant with respect to E(3), the\nsymmetry group of 3D Euclidean space. As a Transformer, GATr is versatile,\nefficient, and scalable. We demonstrate GATr in problems from n-body modeling\nto wall-shear-stress estimation on large arterial meshes to robotic motion\nplanning. GATr consistently outperforms both non-geometric and equivariant\nbaselines in terms of error, data efficiency, and scalability.\n","authors":["Johann Brehmer","Pim de Haan","Sönke Behrends","Taco Cohen"],"pdf_url":"https://arxiv.org/pdf/2305.18415v2.pdf","comment":"v2: more experiments, more baselines"},{"id":"http://arxiv.org/abs/2308.15812v1","updated":"2023-08-30T07:35:32Z","published":"2023-08-30T07:35:32Z","title":"Peering Through Preferences: Unraveling Feedback Acquisition for\n Aligning Large Language Models","summary":" Aligning large language models (LLMs) with human values and intents\ncritically involves the use of human or AI feedback. While dense feedback\nannotations are expensive to acquire and integrate, sparse feedback presents a\nstructural design choice between ratings (e.g., score Response A on a scale of\n1-7) and rankings (e.g., is Response A better than Response B?). In this work,\nwe analyze the effect of this design choice for the alignment and evaluation of\nLLMs. We uncover an inconsistency problem wherein the preferences inferred from\nratings and rankings significantly disagree in 60% of cases for both human and\nAI annotators.
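The rating-ranking inconsistency just mentioned is simple to quantify. A toy illustration with made-up data (the tie-handling choice is our assumption; the paper defines its own protocol): a pair counts as inconsistent when the ordering implied by the two ratings contradicts the pairwise ranking.

```python
# (rating_A, rating_B, ranked_preference); preference is "A" or "B".
pairs = [
    (6, 4, "B"),   # ratings favor A, ranking favors B -> inconsistent
    (3, 5, "B"),   # consistent
    (7, 7, "A"),   # tie in ratings; counted as consistent here (our choice)
]

def inconsistency_rate(pairs) -> float:
    bad = 0
    for ra, rb, pref in pairs:
        rating_pref = "A" if ra > rb else "B" if rb > ra else None
        if rating_pref is not None and rating_pref != pref:
            bad += 1
    return bad / len(pairs)

print(f"inconsistency: {inconsistency_rate(pairs):.0%}")
```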
Our subsequent analysis identifies various facets of annotator\nbiases that explain this phenomenon; for example, human annotators rate denser\nresponses higher while preferring accuracy during pairwise judgments. To our\nsurprise, we also observe that the choice of feedback protocol has a\nsignificant effect on the evaluation of aligned LLMs. In particular, we find\nthat LLMs that leverage rankings data for alignment (say model X) are preferred\nover those that leverage ratings data (say model Y), with a rank-based\nevaluation protocol (is X/Y's response better than the reference response?) but not\nwith a rating-based evaluation protocol (score X/Y's response on a scale\nof 1-7). Our findings thus shed light on critical gaps in methods for\nevaluating the real-world utility of language models and their strong\ndependence on the feedback protocol used for alignment. Our code and data are\navailable at https://github.com/Hritikbansal/sparse_feedback.\n","authors":["Hritik Bansal","John Dang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2308.15812v1.pdf","comment":"24 pages, 12 Tables, 3 Figures"},{"id":"http://arxiv.org/abs/2212.04614v4","updated":"2023-08-30T07:30:28Z","published":"2022-12-09T00:43:49Z","title":"Is Bio-Inspired Learning Better than Backprop? Benchmarking Bio Learning\n vs. Backprop","summary":" Bio-inspired learning has been gaining popularity recently given that\nBackpropagation (BP) is not considered biologically plausible. Many algorithms\nhave been proposed in the literature which are all more biologically plausible\nthan BP. However, apart from overcoming the biological implausibility of BP, a\nstrong motivation for using Bio-inspired algorithms remains lacking. In this\nstudy, we undertake a holistic comparison of BP vs. multiple Bio-inspired\nalgorithms to answer the question of whether Bio-learning offers additional\nbenefits over BP. We test Bio-algorithms under different design choices such as\naccess to only partial training data, resource constraints in terms of the\nnumber of training epochs, sparsification of the neural network parameters and\naddition of noise to input samples. Through these experiments, we notably find\ntwo key advantages of Bio-algorithms over BP. Firstly, Bio-algorithms perform\nmuch better than BP when the entire training dataset is not supplied. Four of\nthe five Bio-algorithms tested outperform BP by up to 5% accuracy when only 20%\nof the training dataset is available. Secondly, even when the full dataset is\navailable, Bio-algorithms learn much more quickly and converge to a stable\naccuracy in far fewer training epochs than BP. Hebbian learning, specifically,\nis able to learn in just 5 epochs compared to around 100 epochs required by BP.\nThese insights present practical reasons for utilising Bio-learning beyond its\nbiological plausibility and also point towards interesting new directions for\nfuture work on Bio-learning.\n","authors":["Manas Gupta","Sarthak Ketanbhai Modi","Hang Zhang","Joon Hei Lee","Joo Hwee Lim"],"pdf_url":"https://arxiv.org/pdf/2212.04614v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15793v1","updated":"2023-08-30T06:53:24Z","published":"2023-08-30T06:53:24Z","title":"HAlf-MAsked Model for Named Entity Sentiment analysis","summary":" Named Entity Sentiment analysis (NESA) is one of the most actively developing\napplication domains in Natural Language Processing (NLP).
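Of the Bio-algorithms benchmarked above, Hebbian learning is the easiest to sketch: it uses only local activity, with no backpropagated error signal. A minimal, simplified single-layer version (the normalization step is a common stabilizer we assume; the benchmarked variants differ in detail):

```python
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(scale=0.01, size=(10, 784))   # 10 units, 784 inputs

def hebbian_step(W: np.ndarray, x: np.ndarray, lr: float = 1e-3) -> np.ndarray:
    y = W @ x                                # local activity only
    W = W + lr * np.outer(y, x)              # "fire together, wire together"
    W /= np.linalg.norm(W, axis=1, keepdims=True)  # keep weights bounded
    return W
```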
Social media NESA is\na significant field of opinion analysis since detecting and tracking sentiment\ntrends in the news flow is crucial for building various analytical systems and\nmonitoring the media image of specific people or companies. In this paper, we\nstudy different transformer-based solutions for NESA in the RuSentNE-23\nevaluation. Despite the effectiveness of BERT-like models, they can still\nstruggle with certain challenges, such as overfitting, which appeared to be the\nmain obstacle in achieving high accuracy on the RuSentNE-23 data. We present\nseveral approaches to overcome this problem, among which is a novel technique\nof an additional pass over the given data with the entity masked before making\nthe final prediction, so that we can combine logits from the model both when it\nknows the exact entity it predicts sentiment for and when it does not. Utilizing\nthis technique, we ensemble multiple BERT-like models trained on different\nsubsets of data to improve overall performance. Our proposed model achieves the\nbest result on RuSentNE-23 evaluation data and demonstrates improved consistency\nin entity-level sentiment analysis.\n","authors":["Anton Kabaev","Pavel Podberezko","Andrey Kaznacheev","Sabina Abdullayeva"],"pdf_url":"https://arxiv.org/pdf/2308.15793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.14417v5","updated":"2023-08-30T06:40:56Z","published":"2021-12-29T06:43:29Z","title":"Control Theoretic Analysis of Temporal Difference Learning","summary":" The goal of this manuscript is to conduct a control-theoretic analysis of\nTemporal Difference (TD) learning algorithms. TD-learning serves as a\ncornerstone in the realm of reinforcement learning, offering a methodology for\napproximating the value function associated with a given policy in a Markov\nDecision Process. Despite several existing works that have contributed to the\ntheoretical understanding of TD-learning, it is only in recent years that\nresearchers have been able to establish concrete guarantees on its statistical\nefficiency. In this paper, we introduce a finite-time, control-theoretic\nframework for analyzing TD-learning, leveraging established concepts from the\nfield of linear systems control. Consequently, this paper provides additional\ninsights into the mechanics of TD learning and the broader landscape of\nreinforcement learning, all while employing straightforward analytical tools\nderived from control theory.\n","authors":["Donghwan Lee","Do Wan Kim"],"pdf_url":"https://arxiv.org/pdf/2112.14417v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15786v1","updated":"2023-08-30T06:36:32Z","published":"2023-08-30T06:36:32Z","title":"FedCiR: Client-Invariant Representation Learning for Federated Non-IID\n Features","summary":" Federated learning (FL) is a distributed learning paradigm that maximizes the\npotential of data-driven models for edge devices without sharing their raw\ndata. However, devices often have non-independent and identically distributed\n(non-IID) data, meaning their local data distributions can vary significantly.\nThe heterogeneity in input data distributions across devices, commonly referred\nto as the feature shift problem, can adversely impact the training convergence\nand accuracy of the global model.
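The half-masked trick in the NESA abstract above reduces to two forward passes and a logit combination. A minimal sketch; `model` and `tokenize` are hypothetical stand-ins for a BERT-like classifier setup, and the simple average is our assumption (the paper may weight the two passes differently):

```python
import torch

def half_masked_logits(model, tokenize, text: str, entity: str) -> torch.Tensor:
    """Combine logits from an entity-aware pass and an entity-masked pass."""
    masked_text = text.replace(entity, "[MASK]")
    with torch.no_grad():
        logits_known = model(**tokenize(text))         # model sees the entity
        logits_masked = model(**tokenize(masked_text)) # entity hidden
    return (logits_known + logits_masked) / 2
```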
To analyze the intrinsic causes of the\nfeature shift problem, we develop a generalization error bound in FL, which\nmotivates us to propose FedCiR, a client-invariant representation learning\nframework that enables clients to extract informative and client-invariant\nfeatures. Specifically, we improve the mutual information term between\nrepresentations and labels to encourage representations to carry essential\nclassification knowledge, and diminish the mutual information term between the\nclient set and representations conditioned on labels to promote representations\nof clients to be client-invariant. We further incorporate two regularizers into\nthe FL framework to bound the mutual information terms with an approximate\nglobal representation distribution to compensate for the absence of the\nground-truth global representation distribution, thus achieving informative and\nclient-invariant feature extraction. To achieve global representation\ndistribution approximation, we propose a data-free mechanism performed by the\nserver without compromising privacy. Extensive experiments demonstrate the\neffectiveness of our approach in achieving client-invariant representation\nlearning and solving the data heterogeneity issue.\n","authors":["Zijian Li","Zehong Lin","Jiawei Shao","Yuyi Mao","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15783v1","updated":"2023-08-30T06:28:42Z","published":"2023-08-30T06:28:42Z","title":"Split Without a Leak: Reducing Privacy Leakage in Split Learning","summary":" The popularity of Deep Learning (DL) makes the privacy of sensitive data more\nimperative than ever. As a result, various privacy-preserving techniques have\nbeen implemented to preserve user data privacy in DL. Among various\nprivacy-preserving techniques, collaborative learning techniques, such as Split\nLearning (SL) have been utilized to accelerate the learning and prediction\nprocess. Initially, SL was considered a promising approach to data privacy.\nHowever, subsequent research has demonstrated that SL is susceptible to many\ntypes of attacks and, therefore, it cannot serve as a privacy-preserving\ntechnique. Meanwhile, countermeasures using a combination of SL and encryption\nhave also been introduced to achieve privacy-preserving deep learning. In this\nwork, we propose a hybrid approach using SL and Homomorphic Encryption (HE).\nThe idea behind it is that the client encrypts the activation map (the output\nof the split layer between the client and the server) before sending it to the\nserver. Hence, during both forward and backward propagation, the server cannot\nreconstruct the client's input data from the intermediate activation map. This\nimprovement is important as it reduces privacy leakage compared to other\nSL-based works, where the server can gain valuable information about the\nclient's input. 
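The split-learning-plus-HE idea described above (encrypt the split-layer activation before it leaves the client) can be sketched with an additively homomorphic scheme. A toy example using Paillier from the `phe` library, which we assume as a stand-in; the paper's actual HE scheme and layer sizes may differ:

```python
from phe import paillier  # pip install phe; additively homomorphic Paillier

# Client side: encrypt the activation map of the split layer before sending.
pub, priv = paillier.generate_paillier_keypair(n_length=2048)
activation = [0.12, -0.57, 0.33]                    # toy split-layer output
enc_activation = [pub.encrypt(a) for a in activation]

# Server side: a linear layer still works, since Paillier supports ciphertext
# addition and plaintext multiplication; the raw activations stay hidden.
weights, bias = [0.4, -0.1, 0.9], 0.05
enc_output = sum(w * a for w, a in zip(weights, enc_activation)) + bias

# Back on the client, the result decrypts to the expected linear output.
print(priv.decrypt(enc_output))
```

Nonlinearities are the hard part in practice, which is one reason hybrid designs keep only the split layer encrypted.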
In addition, on the MIT-BIH dataset, our proposed hybrid\napproach using SL and HE yields faster training time (about 6 times) and\nsignificantly reduced communication overhead (almost 160 times) compared to\nother HE-based approaches, thereby offering improved privacy protection for\nsensitive data in DL.\n","authors":["Khoa Nguyen","Tanveer Khan","Antonis Michalas"],"pdf_url":"https://arxiv.org/pdf/2308.15783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.01710v4","updated":"2023-08-30T05:08:47Z","published":"2020-11-03T13:54:01Z","title":"BCGGAN: Ballistocardiogram artifact removal in simultaneous EEG-fMRI\n using generative adversarial network","summary":" Due to its advantages of high temporal and spatial resolution, the technology\nof simultaneous electroencephalogram-functional magnetic resonance imaging\n(EEG-fMRI) acquisition and analysis has attracted much attention, and has been\nwidely used in various research fields of brain science. However, during the\nfMRI of the brain, ballistocardiogram (BCG) artifacts can seriously contaminate\nthe EEG. As an unpaired problem, BCG artifact removal now remains a\nconsiderable challenge. Aiming to provide a solution, this paper proposed a\nnovel modular generative adversarial network (GAN) and corresponding training\nstrategy to improve the network performance by optimizing the parameters of\neach module. In this manner, we hope to improve the local representation\nability of the network model, thereby improving its overall performance and\nobtaining a reliable generator for BCG artifact removal. Moreover, the proposed\nmethod does not rely on additional reference signal or complex hardware\nequipment. Experimental results show that, compared with multiple methods, the\ntechnique presented in this paper can remove the BCG artifact more effectively\nwhile retaining essential EEG information.\n","authors":["Guang Lin","Jianhai Zhang","Yuxi Liu","Tianyang Gao","Wanzeng Kong","Xu Lei","Tao Qiu"],"pdf_url":"https://arxiv.org/pdf/2011.01710v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04027v3","updated":"2023-08-30T04:55:04Z","published":"2023-04-08T14:40:35Z","title":"Estimating 3D Dental Structures using Simulated Panoramic Radiographs\n and Neural Ray Tracing","summary":" Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality\nfor dental examination. However, PX only provides a flattened 2D image, lacking\nin a 3D view of the oral structure. In this paper, we propose a framework to\nestimate 3D oral structures from real-world PX. Our framework tackles full 3D\nreconstruction for varying subjects (patients) where each reconstruction is\nbased only on a single panoramic image. We create an intermediate\nrepresentation called simulated PX (SimPX) from 3D Cone-beam computed\ntomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and\nrotational principles of PX imaging. SimPX aims at not only truthfully\nsimulating PX, but also facilitates the reverting process back to 3D data. We\npropose a novel neural model based on ray tracing which exploits both global\nand local input features to convert SimPX to 3D output. At inference, a real PX\nimage is translated to a SimPX-style image with semantic regularization, and\nthe translated image is processed by generation module to produce high-quality\noutputs. Experiments show that our method outperforms prior state-of-the-art in\nreconstruction tasks both quantitatively and qualitatively. 
Unlike prior\nmethods, our method does not require any prior information, such as the shape of\ndental arches, nor a matched PX-CBCT dataset for training, which is difficult\nto obtain in clinical practice.\n","authors":["Sihwa Park","Seongjun Kim","Doeyoung Kwon","Yohan Jang","In-Seok Song","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2304.04027v3.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2306.01762v2","updated":"2023-08-30T04:53:15Z","published":"2023-05-27T06:00:51Z","title":"Pre-trained transformer for adversarial purification","summary":" With more and more deep neural networks being deployed as various daily\nservices, their reliability is essential. Worryingly, deep neural networks are\nvulnerable and sensitive to adversarial attacks, the most common of which for\nsuch services are evasion-based. Recent works usually strengthen robustness\nthrough adversarial training or by leveraging the knowledge of a large amount\nof clean data. However, in practical terms, retraining and redeploying the\nmodel require a large computational budget, leading to heavy losses for the\nonline service. In addition, when adversarial examples of a certain attack are\ndetected, only limited adversarial examples are available for the service\nprovider, while much clean data may not be accessible. Given these problems, we\npropose a new scenario, RaPiD (Rapid Plug-in Defender), in which a frozen\noriginal service model must be rapidly defended against a certain attack using\nonly a few clean and adversarial examples. Motivated by the generalization and\nthe universal computation ability of pre-trained transformer models, we propose\na new defender method, CeTaD, which stands for Considering Pre-trained\nTransformers as Defenders. In particular, we evaluate the effectiveness and the\ntransferability of CeTaD in the case of one-shot adversarial examples and\nexplore the impact of different parts of CeTaD as well as training data\nconditions. CeTaD is flexible, able to be embedded into an arbitrary\ndifferentiable model, and suitable for various types of attacks.\n","authors":["Kai Wu","Yujian Betterest Li","Xiaoyu Zhang","Handing Wang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.01762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10235v4","updated":"2023-08-30T04:32:36Z","published":"2023-05-15T15:44:51Z","title":"Assessing Hidden Risks of LLMs: An Empirical Study on Robustness,\n Consistency, and Credibility","summary":" The recent popularity of large language models (LLMs) has had a significant\nimpact on numerous fields, particularly through their open-ended ecosystem of\nAPIs, open-sourced models, and plugins. However, with their widespread\ndeployment, there is a general lack of research that thoroughly discusses and\nanalyzes the concealed potential risks. In that case, we intend to conduct a\npreliminary but pioneering study covering the robustness, consistency, and\ncredibility of LLM systems. With much of the related literature in the LLM era\nstill uncharted, we propose an automated workflow that copes with an upscaled\nnumber of queries/responses. Overall, we conduct over a million queries to\nmainstream LLMs including ChatGPT, LLaMA, and OPT. The core of our workflow is\na data primitive, followed by an automated interpreter that evaluates these\nLLMs under different adversarial metrical systems.
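The spirit of the robustness workflow above can be sketched as a small probe: inject character-level noise into a query and measure how often the answer changes. Everything here is our own hedged illustration; `query_llm` is a hypothetical stand-in for an actual API call, not the paper's tooling:

```python
import random

def perturb(query: str, n_typos: int = 2, seed: int = 0) -> str:
    """Inject a few random character substitutions (minor input errors)."""
    rng = random.Random(seed)
    chars = list(query)
    for _ in range(n_typos):
        i = rng.randrange(len(chars))
        chars[i] = rng.choice("abcdefghijklmnopqrstuvwxyz")
    return "".join(chars)

def consistency_rate(query_llm, query: str, n_variants: int = 20) -> float:
    """Fraction of perturbed queries whose answer matches the clean answer."""
    reference = query_llm(query)
    variants = [query_llm(perturb(query, seed=s)) for s in range(n_variants)]
    return sum(v == reference for v in variants) / n_variants
```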
As a result, we draw several, and perhaps unfortunate,\nconclusions that are quite uncommon in this trendy community. Briefly, they\nare: (i) minor but inevitable errors in user-generated query input may, by\nchance, cause the LLM to respond unexpectedly; (ii) LLMs possess poor\nconsistency when processing semantically similar query inputs. In addition, as\na side finding, we find that ChatGPT is still capable of yielding the correct\nanswer even when the input is polluted at an extreme level. While this\nphenomenon demonstrates the powerful memorization of the LLMs, it raises\nserious concerns about using such data for LLM-involved evaluation in academic\ndevelopment. To deal with it, we propose a novel index associated with a\ndataset that roughly decides the feasibility of using such data for\nLLM-involved evaluation. Extensive empirical studies are tagged to support the\naforementioned claims.\n","authors":["Wentao Ye","Mingfeng Ou","Tianyi Li","Yipeng chen","Xuetao Ma","Yifan Yanggong","Sai Wu","Jie Fu","Gang Chen","Haobo Wang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2305.10235v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03188v2","updated":"2023-08-30T03:47:34Z","published":"2023-08-06T18:38:52Z","title":"Automatically Correcting Large Language Models: Surveying the landscape\n of diverse self-correction strategies","summary":" Large language models (LLMs) have demonstrated remarkable performance across\na wide array of NLP tasks. However, their efficacy is undermined by undesired\nand inconsistent behaviors, including hallucination, unfaithful reasoning, and\ntoxic content. A promising approach to rectify these flaws is self-correction,\nwhere the LLM itself is prompted or guided to fix problems in its own output.\nTechniques leveraging automated feedback -- either produced by the LLM itself\nor some external system -- are of particular interest as they are a promising\nway to make LLM-based solutions more practical and deployable with minimal\nhuman feedback. This paper presents a comprehensive review of this emerging\nclass of techniques. We analyze and taxonomize a wide array of recent work\nutilizing these strategies, including training-time, generation-time, and\npost-hoc correction. We also summarize the major applications of this strategy\nand conclude by discussing future directions and challenges.\n","authors":["Liangming Pan","Michael Saxon","Wenda Xu","Deepak Nathani","Xinyi Wang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03188v2.pdf","comment":"Work in Progress. Version 2"},{"id":"http://arxiv.org/abs/2308.15734v1","updated":"2023-08-30T03:21:45Z","published":"2023-08-30T03:21:45Z","title":"Efficient and Explainable Graph Neural Architecture Search via\n Monte-Carlo Tree Search","summary":" Graph neural networks (GNNs) are powerful tools for performing data science\ntasks in various domains. Although we use GNNs in wide application scenarios,\nit is a laborious task for researchers and practitioners to design/select\noptimal GNN architectures for diverse graphs. To save human effort and\ncomputational costs, graph neural architecture search (Graph NAS) has been used\nto search for a sub-optimal GNN architecture that combines existing components.\nHowever, there are no existing Graph NAS methods that satisfy explainability,\nefficiency, and adaptability to various graphs.
Therefore, we propose an\nefficient and explainable Graph NAS method, called ExGNAS, which consists of\n(i) a simple search space that can adapt to various graphs and (ii) a search\nalgorithm that makes the decision process explainable. The search space\nincludes only fundamental functions that can handle homophilic and heterophilic\ngraphs. The search algorithm efficiently searches for the best GNN architecture\nvia Monte-Carlo tree search without neural models. The combination of our\nsearch space and algorithm enables finding accurate GNN models and identifying\nthe important functions within the search space. We comprehensively evaluate\nour method against twelve hand-crafted GNN architectures and three Graph NAS\nmethods on four graphs. Our experimental results show that ExGNAS increases AUC\nby up to 3.6 points and reduces run time by up to 78% compared with the\nstate-of-the-art Graph NAS methods. Furthermore, we show ExGNAS is effective in\nanalyzing the difference between GNN architectures in homophilic and\nheterophilic graphs.\n","authors":["Yuya Sasaki"],"pdf_url":"https://arxiv.org/pdf/2308.15734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15730v1","updated":"2023-08-30T03:14:02Z","published":"2023-08-30T03:14:02Z","title":"Fully Embedded Time-Series Generative Adversarial Networks","summary":" Generative Adversarial Networks (GANs) should produce synthetic data that\nfits the underlying distribution of the data being modeled. For real-valued\ntime-series data, this implies the need to simultaneously capture not only the\nstatic distribution of the data, but also the full temporal distribution of the\ndata for any potential time horizon. This temporal element produces a more\ncomplex problem that can potentially leave current solutions under-constrained,\nunstable during training, or prone to varying degrees of mode collapse. In\nFETSGAN, entire sequences are translated directly to the generator's sampling\nspace using a seq2seq-style adversarial autoencoder (AAE), where adversarial\ntraining is used to match the training distribution in both the feature space\nand the lower dimensional sampling space. This additional constraint provides a\nloose assurance that the temporal distribution of the synthetic samples will\nnot collapse. In addition, the First Above Threshold (FAT) operator is\nintroduced to supplement the reconstruction of encoded sequences, which\nimproves training stability and the overall quality of the synthetic data being\ngenerated. These novel contributions demonstrate a significant improvement to\nthe current state of the art for adversarial learners in qualitative measures\nof temporal similarity and quantitative predictive ability of data generated\nthrough FETSGAN.\n","authors":["Joe Beck","Subhadeep Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2308.15730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07543v4","updated":"2023-08-30T03:12:34Z","published":"2023-03-14T00:13:57Z","title":"WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant\n Analysis","summary":" Deep neural networks are susceptible to generating overconfident yet\nerroneous predictions when presented with data beyond known concepts. This\nchallenge underscores the importance of detecting out-of-distribution (OOD)\nsamples in the open world.
In this work, we propose a novel feature-space OOD\ndetection score based on class-specific and class-agnostic information.\nSpecifically, the approach utilizes Whitened Linear Discriminant Analysis to\nproject features into two subspaces - the discriminative and residual subspaces\n- for which the in-distribution (ID) classes are maximally separated and\nclosely clustered, respectively. The OOD score is then determined by combining\nthe deviation of the input data from the ID pattern in both subspaces. The\nefficacy of our method, named WDiscOOD, is verified on the large-scale\nImageNet-1k benchmark, with six OOD datasets that cover a variety of\ndistribution shifts. WDiscOOD demonstrates superior performance on deep\nclassifiers with diverse backbone architectures, including CNN and vision\ntransformer. Furthermore, we also show that WDiscOOD more effectively detects\nnovel concepts in representation spaces trained with contrastive objectives,\nincluding supervised contrastive loss and multi-modality contrastive loss.\n","authors":["Yiye Chen","Yunzhi Lin","Ruinian Xu","Patricio A. Vela"],"pdf_url":"https://arxiv.org/pdf/2303.07543v4.pdf","comment":"Accepted by ICCV 2023. Code is available at:\n https://github.com/ivalab/WDiscOOD.git"},{"id":"http://arxiv.org/abs/2308.15720v1","updated":"2023-08-30T02:50:54Z","published":"2023-08-30T02:50:54Z","title":"Surrogate-based Autotuning for Randomized Sketching Algorithms in\n Regression Problems","summary":" Algorithms from Randomized Numerical Linear Algebra (RandNLA) are known to be\neffective in handling high-dimensional computational problems, providing\nhigh-quality empirical performance as well as strong probabilistic guarantees.\nHowever, their practical application is complicated by the fact that the user\nneeds to set various algorithm-specific tuning parameters which are different\nfrom those used in traditional NLA. This paper demonstrates how a\nsurrogate-based autotuning approach can be used to address fundamental problems\nof parameter selection in RandNLA algorithms. In particular, we provide a\ndetailed investigation of surrogate-based autotuning for\nsketch-and-precondition (SAP) based randomized least squares methods, which\nhave been one of the great success stories in modern RandNLA. Empirical results\nshow that our surrogate-based autotuning approach can achieve near-optimal\nperformance with much less tuning cost than a random search (up to about 4x\nfewer trials of different parameter configurations). Moreover, while our\nexperiments focus on least squares, our results demonstrate a general-purpose\nautotuning pipeline applicable to any kind of RandNLA algorithm.\n","authors":["Younghyun Cho","James W. Demmel","Michał Dereziński","Haoyun Li","Hengrui Luo","Michael W. Mahoney","Riley J. Murray"],"pdf_url":"https://arxiv.org/pdf/2308.15720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08108v2","updated":"2023-08-30T02:43:29Z","published":"2022-12-15T19:49:27Z","title":"Dataflow Analysis-Inspired Deep Learning for Efficient Vulnerability\n Detection","summary":" Deep learning-based vulnerability detection has shown great performance and,\nin some studies, outperformed static analysis tools. However, the\nhighest-performing approaches use token-based transformer models, which are not\nthe most efficient at capturing code semantics required for vulnerability\ndetection. Classical program analysis techniques such as dataflow analysis can\ndetect many types of bugs based on their root causes.
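For readers unfamiliar with the classical analysis DeepDFA builds on, here is a toy reaching-definitions pass over a tiny control-flow graph: iterate OUT[b] = GEN[b] ∪ (IN[b] − KILL[b]) to a fixed point. The blocks, definitions, and edges are invented for illustration:

```python
# Block -> definitions generated / definitions killed; preds encode the CFG.
gen   = {"B1": {"x@B1"}, "B2": {"y@B2"}, "B3": {"x@B3"}}
kill  = {"B1": {"x@B3"}, "B2": set(),    "B3": {"x@B1"}}
preds = {"B1": [],       "B2": ["B1", "B3"], "B3": ["B2"]}

out = {b: set() for b in gen}
changed = True
while changed:                       # iterate to a fixed point
    changed = False
    for b in gen:
        in_b = set().union(*(out[p] for p in preds[b])) if preds[b] else set()
        new_out = gen[b] | (in_b - kill[b])
        if new_out != out[b]:
            out[b], changed = new_out, True

print(out)  # definitions reaching the end of each block
```

This fixed-point computation is the kind of propagation the abstract says the graph learning framework is designed to simulate.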
In this paper, we propose\nto combine such causal-based vulnerability detection algorithms with deep\nlearning, aiming to achieve more efficient and effective vulnerability\ndetection. Specifically, we designed DeepDFA, a dataflow analysis-inspired\ngraph learning framework and an embedding technique that enables graph learning\nto simulate dataflow computation. We show that DeepDFA is both performant and\nefficient. DeepDFA outperformed all non-transformer baselines. It was trained\nin 9 minutes, 75x faster than the highest-performing baseline model. When using\nonly 50+ vulnerable and several hundreds of total examples as training data,\nthe model retained the same performance as when trained on 100% of the dataset.\nDeepDFA also generalized to real-world vulnerabilities in DBGBench; it detected\n8.7 out of 17 vulnerabilities on average across folds and was able to\ndistinguish between patched and buggy versions, while the highest-performing\nbaseline models did not detect any vulnerabilities. By combining DeepDFA with a\nlarge language model, we surpassed the state-of-the-art vulnerability detection\nperformance on the Big-Vul dataset with 96.46 F1 score, 97.82 precision, and\n95.14 recall. Our replication package is located at\nhttps://figshare.com/s/e7953b4d345b00990d17.\n","authors":["Benjamin Steenhoek","Hongyang Gao","Wei Le"],"pdf_url":"https://arxiv.org/pdf/2212.08108v2.pdf","comment":"11 pages, 9 figures. Accepted as a conference paper at ICSE 2024"},{"id":"http://arxiv.org/abs/2308.15712v1","updated":"2023-08-30T02:24:09Z","published":"2023-08-30T02:24:09Z","title":"Exploring Deep Learning for Full-disk Solar Flare Prediction with\n Empirical Insights from Guided Grad-CAM Explanations","summary":" This study advances solar flare prediction research by presenting a\nfull-disk deep-learning model to forecast $\\geq$M-class solar flares and\nevaluating its efficacy on both central (within $\\pm$70$^\\circ$) and near-limb\n(beyond $\\pm$70$^\\circ$) events, showcasing qualitative assessment of post hoc\nexplanations for the model's predictions, and providing empirical findings from\nhuman-centered quantitative assessments of these explanations. Our model is\ntrained using hourly full-disk line-of-sight magnetogram images to predict\n$\\geq$M-class solar flares within the subsequent 24-hour prediction window.\nAdditionally, we apply the Guided Gradient-weighted Class Activation Mapping\n(Guided Grad-CAM) attribution method to interpret our model's predictions and\nevaluate the explanations. Our analysis unveils that full-disk solar flare\npredictions correspond with active region characteristics. The following points\nrepresent the most important findings of our study: (1) Our deep learning\nmodels achieved an average true skill statistic (TSS) of $\\sim$0.51 and a\nHeidke skill score (HSS) of $\\sim$0.38, exhibiting skill to predict solar\nflares: for central locations the average recall is $\\sim$0.75 (recall\nvalues for X- and M-class are 0.95 and 0.73 respectively) and for near-limb\nflares the average recall is $\\sim$0.52 (recall values for X- and M-class are\n0.74 and 0.50 respectively); (2) qualitative examination of the model's\nexplanations reveals that it discerns and leverages features linked to active\nregions in both central and near-limb locations within full-disk magnetograms\nto produce respective predictions.
In essence, our models grasp the shape and\ntexture-based properties of flaring active regions, even in proximity to limb\nareas -- a novel and essential capability with considerable significance for\noperational forecasting systems.\n","authors":["Chetraj Pandey","Anli Ji","Trisha Nandakumar","Rafal A. Angryk","Berkay Aydin"],"pdf_url":"https://arxiv.org/pdf/2308.15712v1.pdf","comment":"This is a preprint accepted at the 10th IEEE International Conference\n On Data Science And Advanced Analytics (DSAA 2023). The conference\n proceedings will be published by the IEEE Xplore Digital Library with ISBN:\n 979-8-3503-4503-2. 10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.15710v1","updated":"2023-08-30T02:14:49Z","published":"2023-08-30T02:14:49Z","title":"Speech Wikimedia: A 77 Language Multilingual Speech Dataset","summary":" The Speech Wikimedia Dataset is a publicly available compilation of audio\nwith transcriptions extracted from Wikimedia Commons. It includes 1780 hours\n(195 GB) of CC-BY-SA licensed transcribed speech from a diverse set of\nscenarios and speakers, in 77 different languages. Each audio file has one or\nmore transcriptions in different languages, making this dataset suitable for\ntraining speech recognition, speech translation, and machine translation\nmodels.\n","authors":["Rafael Mosquera Gómez","Julián Eusse","Juan Ciro","Daniel Galvez","Ryan Hileman","Kurt Bollacker","David Kanter"],"pdf_url":"https://arxiv.org/pdf/2308.15710v1.pdf","comment":"Data-Centric Machine Learning Workshop at the International Machine\n Learning Conference 2023 (ICML)"},{"id":"http://arxiv.org/abs/2308.15709v1","updated":"2023-08-30T02:12:00Z","published":"2023-08-30T02:12:00Z","title":"Threshold KNN-Shapley: A Linear-Time and Privacy-Friendly Approach to\n Data Valuation","summary":" Data valuation, a critical aspect of data-centric ML research, aims to\nquantify the usefulness of individual data sources in training machine learning\n(ML) models. However, data valuation faces significant yet frequently\noverlooked privacy challenges despite its importance. This paper studies these\nchallenges with a focus on KNN-Shapley, one of the most practical data\nvaluation methods nowadays. We first emphasize the inherent privacy risks of\nKNN-Shapley, and demonstrate the significant technical difficulties in adapting\nKNN-Shapley to accommodate differential privacy (DP). To overcome these\nchallenges, we introduce TKNN-Shapley, a refined variant of KNN-Shapley that is\nprivacy-friendly, allowing for straightforward modifications to incorporate DP\nguarantee (DP-TKNN-Shapley). We show that DP-TKNN-Shapley has several\nadvantages and offers a superior privacy-utility tradeoff compared to naively\nprivatized KNN-Shapley in discerning data quality. Moreover, even non-private\nTKNN-Shapley achieves comparable performance as KNN-Shapley. Overall, our\nfindings suggest that TKNN-Shapley is a promising alternative to KNN-Shapley,\nparticularly for real-world applications involving sensitive data.\n","authors":["Jiachen T. 
Wang","Yuqing Zhu","Yu-Xiang Wang","Ruoxi Jia","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2308.15709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11197v2","updated":"2023-08-30T02:07:28Z","published":"2023-08-22T05:14:42Z","title":"Toward Generalizable Machine Learning Models in Speech, Language, and\n Hearing Sciences: Sample Size Estimation and Reducing Overfitting","summary":" This study's first purpose is to provide quantitative evidence that would\nincentivize researchers to instead use the more robust method of nested\ncross-validation. The second purpose is to present methods and MATLAB codes for\ndoing power analysis for ML-based analysis during the design of a study. Monte\nCarlo simulations were used to quantify the interactions between the employed\ncross-validation method, the discriminative power of features, the\ndimensionality of the feature space, and the dimensionality of the model. Four\ndifferent cross-validations (single holdout, 10-fold, train-validation-test,\nand nested 10-fold) were compared based on the statistical power and\nstatistical confidence of the ML models. Distributions of the null and\nalternative hypotheses were used to determine the minimum required sample size\nfor obtaining a statistically significant outcome ({\\alpha}=0.05,\n1-\\b{eta}=0.8). Statistical confidence of the model was defined as the\nprobability of correct features being selected and hence being included in the\nfinal model. Our analysis showed that the model generated based on the single\nholdout method had very low statistical power and statistical confidence and\nthat it significantly overestimated the accuracy. Conversely, the nested\n10-fold cross-validation resulted in the highest statistical confidence and the\nhighest statistical power, while providing an unbiased estimate of the\naccuracy. The required sample size with a single holdout could be 50% higher\nthan what would be needed if nested cross-validation were used. Confidence in\nthe model based on nested cross-validation was as much as four times higher\nthan the confidence in the single holdout-based model. A computational model,\nMATLAB codes, and lookup tables are provided to assist researchers with\nestimating the sample size during the design of their future studies.\n","authors":["Hamzeh Ghasemzadeh","Robert E. Hillman","Daryush D. Mehta"],"pdf_url":"https://arxiv.org/pdf/2308.11197v2.pdf","comment":"Under review at JSLHR"},{"id":"http://arxiv.org/abs/2308.15704v1","updated":"2023-08-30T01:59:42Z","published":"2023-08-30T01:59:42Z","title":"Towards a Rigorous Analysis of Mutual Information in Contrastive\n Learning","summary":" Contrastive learning has emerged as a cornerstone in recent achievements of\nunsupervised representation learning. Its primary paradigm involves an instance\ndiscrimination task with a mutual information loss. The loss is known as\nInfoNCE and it has yielded vital insights into contrastive learning through the\nlens of mutual information analysis. However, the estimation of mutual\ninformation can prove challenging, creating a gap between the elegance of its\nmathematical foundation and the complexity of its estimation. As a result,\ndrawing rigorous insights or conclusions from mutual information analysis\nbecomes intricate. 
In this study, we introduce three novel methods and a few\nrelated theorems, aimed at enhancing the rigor of mutual information analysis.\nDespite their simplicity, these methods can carry substantial utility.\nLeveraging these approaches, we reassess three instances of contrastive\nlearning analysis, illustrating their capacity to facilitate deeper\ncomprehension or to rectify pre-existing misconceptions. Specifically, we\ninvestigate small batch size, mutual information as a measure, and the InfoMin\nprinciple.\n","authors":["Kyungeun Lee","Jaeill Kim","Suhyun Kang","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2308.15704v1.pdf","comment":"18 pages, 7 figures, Under review"},{"id":"http://arxiv.org/abs/2308.15703v1","updated":"2023-08-30T01:56:57Z","published":"2023-08-30T01:56:57Z","title":"Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling\n Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate\n Prediction","summary":" Spatial-temporal information has been proven to be of great significance for\nclick-through rate prediction tasks in online Location-Based Services (LBS),\nespecially in mainstream food ordering platforms such as DoorDash, Uber Eats,\nMeituan, and Ele.me. Modeling user spatial-temporal preferences with sequential\nbehavior data has become a hot topic in recommendation systems and online\nadvertising. However, most existing methods either lack the representation\nof rich spatial-temporal information or only handle user behaviors with limited\nlength, e.g., 100. In this paper, we tackle these problems by designing a new\nspatial-temporal modeling paradigm named Fragment and Integrate Network (FIN).\nFIN consists of two networks: (i) Fragment Network (FN) extracts Multiple\nSub-Sequences (MSS) from lifelong sequential behavior data, and captures the\nspecific spatial-temporal representation by modeling each MSS respectively.\nHere both a simplified attention and a complicated attention are adopted to\nbalance the performance gain and resource consumption. (ii) Integrate Network\n(IN) builds a new integrated sequence by utilizing spatial-temporal interaction\non MSS and captures the comprehensive spatial-temporal representation by\nmodeling the integrated sequence with a complicated attention. Both public\ndatasets and production datasets have demonstrated the accuracy and scalability\nof FIN.
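For reference alongside the contrastive-learning abstract above, the InfoNCE loss it analyzes has a common SimCLR-style form. This is a standard formulation, not code from the paper:

```python
import torch
import torch.nn.functional as F

def info_nce(z1: torch.Tensor, z2: torch.Tensor, tau: float = 0.1) -> torch.Tensor:
    """InfoNCE over a batch of paired views; positives lie on the diagonal."""
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / tau                 # cosine similarities of all pairs
    targets = torch.arange(z1.size(0))         # i-th row's positive is column i
    return F.cross_entropy(logits, targets)

loss = info_nce(torch.randn(8, 128), torch.randn(8, 128))
```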
Since 2022, FIN has been fully deployed in the recommendation\nadvertising system of Ele.me, one of the most popular online food ordering\nplatforms in China, obtaining 5.7% improvement on Click-Through Rate (CTR) and\n7.3% increase on Revenue Per Mille (RPM).\n","authors":["Jun Li","Jingjian Wang","Hongwei Wang","Xing Deng","Jielong Chen","Bing Cao","Zekun Wang","Guanjie Xu","Ge Zhang","Feng Shi","Hualei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15703v1.pdf","comment":"Accepted by CIKM 2023 Applied Research Paper"},{"id":"http://arxiv.org/abs/2308.15700v1","updated":"2023-08-30T01:54:31Z","published":"2023-08-30T01:54:31Z","title":"Training Towards Critical Use: Learning to Situate AI Predictions\n Relative to Human Knowledge","summary":" A growing body of research has explored how to support humans in making\nbetter use of AI-based decision support, including via training and onboarding.\nExisting research has focused on decision-making tasks where it is possible to\nevaluate \"appropriate reliance\" by comparing each decision against a ground\ntruth label that cleanly maps to both the AI's predictive target and the human\ndecision-maker's goals. However, this assumption does not hold in many\nreal-world settings where AI tools are deployed today (e.g., social work,\ncriminal justice, and healthcare). In this paper, we introduce a\nprocess-oriented notion of appropriate reliance called critical use that\ncenters the human's ability to situate AI predictions against knowledge that is\nuniquely available to them but unavailable to the AI model. To explore how\ntraining can support critical use, we conduct a randomized online experiment in\na complex social decision-making setting: child maltreatment screening. We find\nthat, by providing participants with accelerated, low-stakes opportunities to\npractice AI-assisted decision-making in this setting, novices came to exhibit\npatterns of disagreement with AI that resemble those of experienced workers. A\nqualitative examination of participants' explanations for their AI-assisted\ndecisions revealed that they drew upon qualitative case narratives, to which\nthe AI model did not have access, to learn when (not) to rely on AI\npredictions. Our findings open new questions for the study and design of\ntraining for real-world AI-assisted decision-making.\n","authors":["Anna Kawakami","Luke Guerdan","Yanghuidi Cheng","Matthew Lee","Scott Carter","Nikos Arechiga","Kate Glazko","Haiyi Zhu","Kenneth Holstein"],"pdf_url":"https://arxiv.org/pdf/2308.15700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14945v2","updated":"2023-08-30T01:48:21Z","published":"2023-08-28T23:51:33Z","title":"Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals","summary":" We consider the problem of sampling from a distribution governed by a\npotential function. This work proposes an explicit score-based MCMC method that\nis deterministic, resulting in a deterministic evolution for particles rather\nthan a stochastic differential equation evolution. The score term is given in\nclosed form by a regularized Wasserstein proximal, using a kernel convolution\nthat is approximated by sampling. We demonstrate fast convergence on various\nproblems and show improved dimensional dependence of mixing time bounds for the\ncase of Gaussian distributions compared to the unadjusted Langevin algorithm\n(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). 
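For context on the sampling abstract above: the unadjusted Langevin algorithm (ULA) that the proposed deterministic method is compared against is itself a few lines. A minimal baseline sketch on a standard Gaussian potential U(x) = ||x||^2 / 2 (our toy choice; the proposed method, by contrast, evolves particles deterministically):

```python
import numpy as np

def ula(grad_U, x0, step=0.1, n_iters=1000, seed=0):
    """Unadjusted Langevin: gradient step plus Gaussian noise."""
    rng = np.random.default_rng(seed)
    x = np.array(x0, dtype=float)
    for _ in range(n_iters):
        x = x - step * grad_U(x) + np.sqrt(2 * step) * rng.standard_normal(x.shape)
    return x

samples = np.stack([ula(lambda x: x, np.zeros(2), seed=s) for s in range(500)])
print(samples.mean(axis=0), samples.var(axis=0))  # approx. zero mean, unit variance
```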
We additionally\nderive closed form expressions for the distributions at each iterate for\nquadratic potential functions, characterizing the variance reduction. Empirical\nresults demonstrate that the particles behave in an organized manner, lying on\nlevel set contours of the potential. Moreover, the posterior mean estimator of\nthe proposed method is shown to be closer to the maximum a-posteriori estimator\ncompared to ULA and MALA, in the context of Bayesian logistic regression.\n","authors":["Hong Ye Tan","Stanley Osher","Wuchen Li"],"pdf_url":"https://arxiv.org/pdf/2308.14945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04150v4","updated":"2023-08-30T01:47:53Z","published":"2023-03-07T01:38:42Z","title":"Evolutionary Reinforcement Learning: A Survey","summary":" Reinforcement learning (RL) is a machine learning approach that trains agents\nto maximize cumulative rewards through interactions with environments. The\nintegration of RL with deep learning has recently resulted in impressive\nachievements in a wide range of challenging tasks, including board games,\narcade games, and robot control. Despite these successes, there remain several\ncrucial challenges, including brittle convergence properties caused by\nsensitive hyperparameters, difficulties in temporal credit assignment with long\ntime horizons and sparse rewards, a lack of diverse exploration, especially in\ncontinuous search space scenarios, difficulties in credit assignment in\nmulti-agent reinforcement learning, and conflicting objectives for rewards.\nEvolutionary computation (EC), which maintains a population of learning agents,\nhas demonstrated promising performance in addressing these limitations. This\narticle presents a comprehensive survey of state-of-the-art methods for\nintegrating EC into RL, referred to as evolutionary reinforcement learning\n(EvoRL). We categorize EvoRL methods according to key research fields in RL,\nincluding hyperparameter optimization, policy search, exploration, reward\nshaping, meta-RL, and multi-objective RL. We then discuss future research\ndirections in terms of efficient methods, benchmarks, and scalable platforms.\nThis survey serves as a resource for researchers and practitioners interested\nin the field of EvoRL, highlighting the important challenges and opportunities\nfor future research. With the help of this survey, researchers and\npractitioners can develop more efficient methods and tailored benchmarks for\nEvoRL, further advancing this promising cross-disciplinary research field.\n","authors":["Hui Bai","Ran Cheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2303.04150v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15697v1","updated":"2023-08-30T01:40:38Z","published":"2023-08-30T01:40:38Z","title":"Segmenting mechanically heterogeneous domains via unsupervised learning","summary":" From biological organs to soft robotics, highly deformable materials are\nessential components of natural and engineered systems. These highly deformable\nmaterials can have heterogeneous material properties, and can experience\nheterogeneous deformations with or without underlying material heterogeneity.\nMany recent works have established that computational modeling approaches are\nwell suited for understanding and predicting the consequences of material\nheterogeneity and for interpreting observed heterogeneous strain fields. 
In\nparticular, there has been significant work towards developing inverse analysis\napproaches that can convert observed kinematic quantities (e.g., displacement,\nstrain) to material properties and mechanical state. Despite the success of\nthese approaches, they are not necessarily generalizable and often rely on\ntight control and knowledge of boundary conditions. Here, we will build on the\nrecent advances (and ubiquity) of machine learning approaches to explore\nalternative approaches to detect patterns in heterogeneous material properties\nand mechanical behavior. Specifically, we will explore unsupervised learning\napproaches to clustering and ensemble clustering to identify heterogeneous\nregions. Overall, we find that these approaches are effective, yet limited in\ntheir abilities. Through this initial exploration (where all data and code are\npublished alongside this manuscript), we set the stage for future studies that\nmore specifically adapt these methods to mechanical data.\n","authors":["Quan Nguyen","Emma Lejeune"],"pdf_url":"https://arxiv.org/pdf/2308.15697v1.pdf","comment":"26 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.06767v3","updated":"2023-08-30T01:25:29Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to the successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhancing the model by fine-tuning on\nthese filtered samples. Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v3.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.15690v1","updated":"2023-08-30T01:14:32Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprouts image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. 
The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology. For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The label has 4 different\nclasses: background, head, body, tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, 5 physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. Also, we hope that this dataset can assist researchers\nstudying classification, semantic segmentation, decomposition, and physical\nfeature measurement in other industrial fields, in evaluating their models. The\ndataset is available at the authors' repository. (https://bhban.kr/data)\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v1.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"},{"id":"http://arxiv.org/abs/2001.10474v3","updated":"2023-08-30T00:10:47Z","published":"2020-01-28T17:31:23Z","title":"Coagent Networks Revisited","summary":" Coagent networks formalize the concept of arbitrary networks of stochastic\nagents that collaborate to take actions in a reinforcement learning\nenvironment. Prominent examples of coagent networks in action include\napproaches to hierarchical reinforcement learning (HRL), such as those using\noptions, which attempt to address the exploration exploitation trade-off by\nintroducing abstract actions at different levels by sequencing multiple\nstochastic networks within the HRL agents. We first provide a unifying\nperspective on the many diverse examples that fall under coagent networks. We\ndo so by formalizing the rules of execution in a coagent network, enabled by\nthe novel and intuitive idea of execution paths in a coagent network. Motivated\nby parameter sharing in the hierarchical option-critic architecture, we revisit\nthe coagent network theory and achieve a much shorter proof of the policy\ngradient theorem using our idea of execution paths, without any assumption on\nhow parameters are shared among coagents. We then generalize our setting and\nproof to include the scenario where coagents act asynchronously. This new\nperspective and theorem also lead to more mathematically accurate and\nperformant algorithms than those in the existing literature. Lastly, by running\nnonstationary RL experiments, we survey the performance and properties of\ndifferent generalizations of option-critic models.\n","authors":["Modjtaba Shokrian Zini","Mohammad Pedramfar","Matthew Riemer","Ahmadreza Moradipari","Miao Liu"],"pdf_url":"https://arxiv.org/pdf/2001.10474v3.pdf","comment":"Reformatted paper significantly and clarified results on the\n asynchronous case"},{"id":"http://arxiv.org/abs/2305.10455v3","updated":"2023-08-30T00:05:26Z","published":"2023-05-17T02:53:58Z","title":"Towards Generalist Robots: A Promising Paradigm via Generative\n Simulation","summary":" This document serves as a position paper that outlines the authors' vision\nfor a potential pathway towards generalist robots. 
The purpose of this document\nis to share the excitement of the authors with the community and highlight a\npromising research direction in robotics and AI. The authors believe the\nproposed paradigm is a feasible path towards accomplishing the long-standing\ngoal of robotics research: deploying robots, or embodied AI agents more\nbroadly, in various non-factory real-world settings to perform diverse tasks.\nThis document presents a specific idea for mining knowledge in the latest\nlarge-scale foundation models for robotics research. Instead of directly using\nor adapting these models to produce low-level policies and actions, it\nadvocates for a fully automated generative pipeline (termed generative\nsimulation), which uses these models to generate diversified tasks, scenes and\ntraining supervisions at scale, thereby scaling up low-level skill learning and\nultimately leading to a foundation model for robotics that empowers generalist\nrobots. The authors are actively pursuing this direction, but in the meantime,\nthey recognize that the ambitious goal of building generalist robots with\nlarge-scale policy training demands significant resources such as computing\npower and hardware, and research groups in academia alone may face severe\nresource constraints in implementing the entire vision. Therefore, the authors\nbelieve sharing their thoughts at this early stage could foster discussions,\nattract interest towards the proposed pathway and related topics from industry\ngroups, and potentially spur significant technical advancements in the field.\n","authors":["Zhou Xian","Theophile Gervet","Zhenjia Xu","Yi-Ling Qiao","Tsun-Hsuan Wang","Yian Wang"],"pdf_url":"https://arxiv.org/pdf/2305.10455v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15673v1","updated":"2023-08-30T00:03:03Z","published":"2023-08-30T00:03:03Z","title":"MDTD: A Multi Domain Trojan Detector for Deep Neural Networks","summary":" Machine learning models that use deep neural networks (DNNs) are vulnerable\nto backdoor attacks. An adversary carrying out a backdoor attack embeds a\npredefined perturbation called a trigger into a small subset of input samples\nand trains the DNN such that the presence of the trigger in the input results\nin an adversary-desired output class. Such adversarial retraining, however, needs\nto ensure that outputs for inputs without the trigger remain unaffected and\nprovide high classification accuracy on clean samples. In this paper, we\npropose MDTD, a Multi-Domain Trojan Detector for DNNs, which detects inputs\ncontaining a Trojan trigger at testing time. MDTD does not require knowledge of\nthe trigger-embedding strategy of the attacker and can be applied to a pre-trained\nDNN model with image, audio, or graph-based inputs. MDTD leverages an insight\nthat input samples containing a Trojan trigger are located relatively farther\naway from a decision boundary than clean samples. MDTD estimates the distance\nto a decision boundary using adversarial learning methods and uses this\ndistance to infer whether a test-time input sample is Trojaned or not. We\nevaluate MDTD against state-of-the-art Trojan detection methods across five\nwidely used image-based datasets: CIFAR100, CIFAR10, GTSRB, SVHN, and\nFlowers102; four graph-based datasets: AIDS, WinMal, Toxicant, and COLLAB; and\nthe SpeechCommand audio dataset. MDTD effectively identifies samples that\ncontain different types of Trojan triggers. 
We evaluate MDTD against adaptive\nattacks where an adversary trains a robust DNN to increase (decrease) distance\nof benign (Trojan) inputs from a decision boundary.\n","authors":["Arezoo Rajabi","Surudhi Asokraj","Fengqing Jiang","Luyao Niu","Bhaskar Ramasubramanian","Jim Ritcey","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2308.15673v1.pdf","comment":"Accepted to ACM Conference on Computer and Communications Security\n (ACM CCS) 2023"},{"id":"http://arxiv.org/abs/2308.16362v1","updated":"2023-08-30T23:34:11Z","published":"2023-08-30T23:34:11Z","title":"A Unified Analysis for the Subgradient Methods Minimizing Composite\n Nonconvex, Nonsmooth and Non-Lipschitz Functions","summary":" In this paper we propose a proximal subgradient method (Prox-SubGrad) for\nsolving nonconvex and nonsmooth optimization problems without assuming\nLipschitz continuity conditions. A number of subgradient upper bounds and their\nrelationships are presented. By means of these upper bounding conditions, we\nestablish some uniform recursive relations for the Moreau envelopes for weakly\nconvex optimization. This uniform scheme simplifies and unifies the proof\nschemes to establish the rate of convergence for Prox-SubGrad without assuming\nLipschitz continuity. We present a novel convergence analysis in this context.\nFurthermore, we propose some new stochastic subgradient upper bounding\nconditions and establish convergence and iteration complexity rates for the\nstochastic subgradient method (Sto-SubGrad) to solve non-Lipschitz and\nnonsmooth stochastic optimization problems. In particular, for both\ndeterministic and stochastic subgradient methods on weakly convex optimization\nproblems without Lipschitz continuity, under any of the subgradient upper\nbounding conditions to be introduced in the paper, we show that an $O(1/\\sqrt{T})$\nconvergence rate holds in terms of the square of the gradient of the Moreau\nenvelope function, which further improves to $O(1/{T})$ if, in addition, the\nuniform KL condition with exponent $1/2$ holds.\n","authors":["Daoli Zhu","Lei Zhao","Shuzhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16360v1","updated":"2023-08-30T23:26:33Z","published":"2023-08-30T23:26:33Z","title":"Emoji Promotes Developer Participation and Issue Resolution on GitHub","summary":" Although remote working is increasingly adopted during the pandemic, many are\nconcerned by the low efficiency in remote working. Missing in text-based\ncommunication are non-verbal cues such as facial expressions and body language,\nwhich hinders the effective communication and negatively impacts the work\noutcomes. Prevalent on social media platforms, emojis, as alternative\nnon-verbal cues, are gaining popularity in the virtual workspaces as well. In this\npaper, we study how emoji usage influences developer participation and issue\nresolution in virtual workspaces. To this end, we collect GitHub issues for a\none-year period and apply causal inference techniques to measure the causal\neffect of emojis on the outcome of issues, controlling for confounders such as\nissue content, repository, and author information. We find that emojis can\nsignificantly reduce the resolution time of issues and attract more user\nparticipation. We also compare the heterogeneous effect on different types of\nissues. 
These findings deepen our understanding of the developer communities,\nand they provide design implications on how to facilitate interactions and\nbroaden developer participation.\n","authors":["Yuhang Zhou","Xuan Lu","Ge Gao","Qiaozhu Mei","Wei Ai"],"pdf_url":"https://arxiv.org/pdf/2308.16360v1.pdf","comment":"12 pages, 5 figures. To be published in the 18th International AAAI\n Conference on Web and Social Media (ICWSM 2024)"},{"id":"http://arxiv.org/abs/2306.01890v2","updated":"2023-08-30T23:17:53Z","published":"2023-06-02T19:51:48Z","title":"Mixed-type Distance Shrinkage and Selection for Clustering via Kernel\n Metric Learning","summary":" Distance-based clustering and classification are widely used in various\nfields to group mixed numeric and categorical data. In many algorithms, a\npredefined distance measurement is used to cluster data points based on their\ndissimilarity. While there exist numerous distance-based measures for data with\npure numerical attributes and several ordered and unordered categorical\nmetrics, an efficient and accurate distance for mixed-type data that utilizes\nthe continuous and discrete properties simultaneously is an open problem. Many\nmetrics convert numerical attributes to categorical ones or vice versa. They\nhandle the data points as a single attribute type or calculate a distance\nbetween each attribute separately and add them up. We propose a metric called\nKDSUM that uses mixed kernels to measure dissimilarity, with cross-validated\noptimal bandwidth selection. We demonstrate that KDSUM is a shrinkage method\nfrom existing mixed-type metrics to a uniform dissimilarity metric, and\nimproves clustering accuracy when utilized in existing distance-based\nclustering algorithms on simulated and real-world datasets containing\ncontinuous-only, categorical-only, and mixed-type data.\n","authors":["Jesse S. Ghashti","John R. J. Thompson"],"pdf_url":"https://arxiv.org/pdf/2306.01890v2.pdf","comment":"38 pages, 3 tables, 8 figures"},{"id":"http://arxiv.org/abs/2302.12431v2","updated":"2023-08-30T23:13:13Z","published":"2023-02-24T03:18:45Z","title":"Flexible Phase Dynamics for Bio-Plausible Contrastive Learning","summary":" Many learning algorithms used as normative models in neuroscience or as\ncandidate approaches for learning on neuromorphic chips learn by contrasting\none set of network states with another. These Contrastive Learning (CL)\nalgorithms are traditionally implemented with rigid, temporally non-local, and\nperiodic learning dynamics that could limit the range of physical systems\ncapable of harnessing CL. In this study, we build on recent work exploring how\nCL might be implemented by biological or neuromorphic systems and show that this\nform of learning can be made temporally local, and can still function even if\nmany of the dynamical requirements of standard training procedures are relaxed.\nThanks to a set of general theorems corroborated by numerical experiments\nacross several CL models, our results provide theoretical foundations for the\nstudy and development of CL methods for biological and neuromorphic neural\nnetworks.\n","authors":["Ezekiel Williams","Colin Bredenberg","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2302.12431v2.pdf","comment":"23 pages, 4 figures. 
Paper accepted to ICML and update includes\n changes made based on reviewer feedback"},{"id":"http://arxiv.org/abs/2209.02064v2","updated":"2023-08-30T22:48:02Z","published":"2022-09-05T17:18:43Z","title":"GRASP: A Goodness-of-Fit Test for Classification Learning","summary":" Performance of classifiers is often measured in terms of average accuracy on\ntest data. Despite being a standard measure, average accuracy fails in\ncharacterizing the fit of the model to the underlying conditional law of labels\ngiven the features vector ($Y|X$), e.g. due to model misspecification,\noverfitting, and high-dimensionality. In this paper, we consider the fundamental\nproblem of assessing the goodness-of-fit for a general binary classifier. Our\nframework does not make any parametric assumption on the conditional law $Y|X$,\nand treats that as a black box oracle model which can be accessed only through\nqueries. We formulate the goodness-of-fit assessment problem as a tolerance\nhypothesis testing of the form \\[ H_0: \\mathbb{E}\\Big[D_f\\Big({\\sf\nBern}(\\eta(X))\\|{\\sf Bern}(\\hat{\\eta}(X))\\Big)\\Big]\\leq \\tau\\,, \\] where $D_f$\nrepresents an $f$-divergence function, and $\\eta(x)$, $\\hat{\\eta}(x)$\nrespectively denote the true and an estimated likelihood for a feature vector\n$x$ admitting a positive label. We propose a novel test, called \\grasp for\ntesting $H_0$, which works in finite sample settings, no matter the features\n(distribution-free). We also propose model-X \\grasp designed for model-X\nsettings where the joint distribution of the features vector is known. Model-X\n\\grasp uses this distributional information to achieve better power. We\nevaluate the performance of our tests through extensive numerical experiments.\n","authors":["Adel Javanmard","Mohammad Mehrabi"],"pdf_url":"https://arxiv.org/pdf/2209.02064v2.pdf","comment":"54 pages, 4 tables and 5 figures"},{"id":"http://arxiv.org/abs/2303.09981v2","updated":"2023-08-30T22:44:43Z","published":"2023-03-17T13:58:06Z","title":"Inferring Traffic Models in Terminal Airspace from Flight Tracks and\n Procedures","summary":" Realistic aircraft trajectory models are useful in the design and validation\nof air traffic management (ATM) systems. Models of aircraft operated under\ninstrument flight rules (IFR) require capturing the variability inherent in how\naircraft follow standard flight procedures. The variability in aircraft\nbehavior varies among flight stages. In this paper, we propose a probabilistic\nmodel that can learn the variability from the procedural data and flight tracks\ncollected from radar surveillance data. For each segment, a Gaussian mixture\nmodel is used to learn the deviations of aircraft trajectories from their\nprocedures. Given new procedures, we can generate synthetic trajectories by\nsampling a series of deviations from the trained Gaussian distributions and\nreconstructing the aircraft trajectory using the deviations and the procedures.\nWe extend this method to capture pairwise correlations between aircraft and\nshow how a pairwise model can be used to generate traffic involving an\narbitrary number of aircraft. We demonstrate the proposed models on the arrival\ntracks and procedures of the John F. Kennedy International Airport. The\ndistributional similarity between the original and the synthetic trajectory\ndataset was evaluated using the Jensen-Shannon divergence between the empirical\ndistributions of different variables. 
We also provide qualitative analyses of\nthe synthetic trajectories generated from the models.\n","authors":["Soyeon Jung","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2303.09981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.05102v2","updated":"2023-08-30T21:58:45Z","published":"2022-10-11T02:39:06Z","title":"Pre-Training Representations of Binary Code Using Contrastive Learning","summary":" Compiled software is delivered as executable binary code. Developers write\nsource code to express the software semantics, but the compiler converts it to\na binary format that the CPU can directly execute. Therefore, binary code\nanalysis is critical to applications in reverse engineering and computer\nsecurity tasks where source code is not available. However, unlike source code\nand natural language that contain rich semantic information, binary code is\ntypically difficult for human engineers to understand and analyze. While\nexisting work uses AI models to assist source code analysis, few studies have\nconsidered binary code. In this paper, we propose a COntrastive learning Model\nfor Binary cOde Analysis, or COMBO, that incorporates source code and comment\ninformation into binary code during representation learning. Specifically, we\npresent three components in COMBO: (1) a primary contrastive learning method\nfor cold-start pre-training, (2) a simplex interpolation method to incorporate\nsource code, comments, and binary code, and (3) an intermediate representation\nlearning algorithm to provide binary code embeddings. Finally, we evaluate the\neffectiveness of the pre-trained representations produced by COMBO using three\nindicative downstream tasks relating to binary code: algorithmic functionality\nclassification, binary code similarity, and vulnerability detection. Our\nexperimental results show that COMBO facilitates representation learning of\nbinary code visualized by distribution analysis, and improves the performance\non all three downstream tasks by 5.45% on average compared to state-of-the-art\nlarge-scale language representation models. To the best of our knowledge, COMBO\nis the first language representation model that incorporates source code,\nbinary code, and comments into contrastive code representation learning and\nunifies multiple tasks for binary code analysis.\n","authors":["Yifan Zhang","Chen Huang","Yueke Zhang","Kevin Cao","Scott Thomas Andersen","Huajie Shao","Kevin Leach","Yu Huang"],"pdf_url":"https://arxiv.org/pdf/2210.05102v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16336v1","updated":"2023-08-30T21:56:36Z","published":"2023-08-30T21:56:36Z","title":"ToddlerBERTa: Exploiting BabyBERTa for Grammar Learning and Language\n Understanding","summary":" We present ToddlerBERTa, a BabyBERTa-like language model, exploring its\ncapabilities through five different models with varied hyperparameters.\nEvaluating on BLiMP, SuperGLUE, MSGS, and a Supplement benchmark from the\nBabyLM challenge, we find that smaller models can excel in specific tasks,\nwhile larger models perform well with substantial data. Despite training on a\nsmaller dataset, ToddlerBERTa demonstrates commendable performance, rivalling\nthe state-of-the-art RoBERTa-base. The model showcases robust language\nunderstanding, even with single-sentence pretraining, and competes with\nbaselines that leverage broader contextual information. 
Our work provides\ninsights into hyperparameter choices, and data utilization, contributing to the\nadvancement of language models.\n","authors":["Omer Veysel Cagatan"],"pdf_url":"https://arxiv.org/pdf/2308.16336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06677v2","updated":"2023-08-30T21:37:02Z","published":"2023-02-13T20:32:37Z","title":"System identification of neural systems: If we got it right, would we\n know?","summary":" Artificial neural networks are being proposed as models of parts of the\nbrain. The networks are compared to recordings of biological neurons, and good\nperformance in reproducing neural responses is considered to support the\nmodel's validity. A key question is how much this system identification\napproach tells us about brain computation. Does it validate one model\narchitecture over another? We evaluate the most commonly used comparison\ntechniques, such as a linear encoding model and centered kernel alignment, to\ncorrectly identify a model by replacing brain recordings with known ground\ntruth models. System identification performance is quite variable; it also\ndepends significantly on factors independent of the ground truth architecture,\nsuch as stimuli images. In addition, we show the limitations of using\nfunctional similarity scores in identifying higher-level architectural motifs.\n","authors":["Yena Han","Tomaso Poggio","Brian Cheung"],"pdf_url":"https://arxiv.org/pdf/2302.06677v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.02796v2","updated":"2023-08-30T21:35:48Z","published":"2022-10-06T10:14:41Z","title":"Hypernetwork approach to Bayesian MAML","summary":" The main goal of Few-Shot learning algorithms is to enable learning from\nsmall amounts of data. One of the most popular and elegant Few-Shot learning\napproaches is Model-Agnostic Meta-Learning (MAML). The main idea behind this\nmethod is to learn the shared universal weights of a meta-model, which are then\nadapted for specific tasks. However, the method suffers from over-fitting and\npoorly quantifies uncertainty due to limited data size. Bayesian approaches\ncould, in principle, alleviate these shortcomings by learning weight\ndistributions in place of point-wise weights. Unfortunately, previous\nmodifications of MAML are limited due to the simplicity of Gaussian posteriors,\nMAML-like gradient-based weight updates, or by the same structure enforced for\nuniversal and adapted weights.\n In this paper, we propose a novel framework for Bayesian MAML called\nBayesianHMAML, which employs Hypernetworks for weight updates. It learns the\nuniversal weights point-wise, but a probabilistic structure is added when\nadapted for specific tasks. In such a framework, we can use simple Gaussian\ndistributions or more complicated posteriors induced by Continuous Normalizing\nFlows.\n","authors":["Piotr Borycki","Piotr Kubacki","Marcin Przewięźlikowski","Tomasz Kuśmierczyk","Jacek Tabor","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2210.02796v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2205.15745"},{"id":"http://arxiv.org/abs/2308.16331v1","updated":"2023-08-30T21:34:33Z","published":"2023-08-30T21:34:33Z","title":"Symmetry Preservation in Hamiltonian Systems: Simulation and Learning","summary":" This work presents a general geometric framework for simulating and learning\nthe dynamics of Hamiltonian systems that are invariant under a Lie group of\ntransformations. 
This means that a group of symmetries is known to act on the\nsystem respecting its dynamics and, as a consequence of Noether's Theorem,\nconserved quantities are observed. We propose to simulate and learn the\nmappings of interest through the construction of $G$-invariant Lagrangian\nsubmanifolds, which are pivotal objects in symplectic geometry. A notable\nproperty of our constructions is that the simulated/learned dynamics also\npreserves the same conserved quantities as the original system, resulting in a\nmore faithful surrogate of the original dynamics than non-symmetry aware\nmethods, and in a more accurate predictor of non-observed trajectories.\nFurthermore, our setting is able to simulate/learn not only Hamiltonian flows,\nbut any Lie group-equivariant symplectic transformation. Our designs leverage\npivotal techniques and concepts in symplectic geometry and geometric mechanics:\nreduction theory, Noether's Theorem, Lagrangian submanifolds, momentum\nmappings, and coisotropic reduction among others. We also present methods to\nlearn Poisson transformations while preserving the underlying geometry and how\nto endow non-geometric integrators with geometric properties. Thus, this work\npresents a novel attempt to harness the power of symplectic and Poisson\ngeometry towards simulating and learning problems.\n","authors":["Miguel Vaquero","Jorge Cortés","David Martín de Diego"],"pdf_url":"https://arxiv.org/pdf/2308.16331v1.pdf","comment":"32 pages, 19 figures"},{"id":"http://arxiv.org/abs/2308.11155v2","updated":"2023-08-30T20:55:07Z","published":"2023-08-22T03:23:36Z","title":"xxMD: Benchmarking Neural Force Fields Using Extended Dynamics beyond\n Equilibrium","summary":" Neural force fields (NFFs) have gained prominence in computational chemistry\nas surrogate models, superseding quantum-chemistry calculations in ab initio\nmolecular dynamics. The prevalent benchmark for NFFs has been the MD17 dataset\nand its subsequent extension. These datasets predominantly comprise geometries\nfrom the equilibrium region of the ground electronic state potential energy\nsurface, sampling from direct adiabatic dynamics. However, many chemical\nreactions entail significant molecular deformations, notably bond breaking. We\ndemonstrate the constrained distribution of internal coordinates and energies\nin the MD17 datasets, underscoring their inadequacy for representing systems\nundergoing chemical reactions. Addressing this sampling limitation, we\nintroduce the xxMD (Extended Excited-state Molecular Dynamics) dataset, derived\nfrom non-adiabatic dynamics. This dataset encompasses energies and forces\nascertained from both multireference wave function theory and density\nfunctional theory. Furthermore, its nuclear configuration spaces authentically\ndepict chemical reactions, making xxMD a more chemically relevant dataset. Our\nre-assessment of equivariant models on the xxMD datasets reveals notably higher\nmean absolute errors than those reported for MD17 and its variants. This\nobservation underscores the challenges faced in crafting a generalizable NFF\nmodel with extrapolation capability. Our proposed xxMD-CASSCF and xxMD-DFT\ndatasets are available at https://github.com/zpengmei/xxMD.\n","authors":["Zihan Pengmei","Yinan Shu","Junyu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11155v2.pdf","comment":"19 pages, many figures. 
Data available at\n https://github.com/zpengmei/xxMD"},{"id":"http://arxiv.org/abs/2308.16316v1","updated":"2023-08-30T20:46:45Z","published":"2023-08-30T20:46:45Z","title":"Ten Years of Generative Adversarial Nets (GANs): A survey of the\n state-of-the-art","summary":" Since their inception in 2014, Generative Adversarial Networks (GANs) have\nrapidly emerged as powerful tools for generating realistic and diverse data\nacross various domains, including computer vision and other applied areas.\nConsisting of a discriminative network and a generative network engaged in a\nMinimax game, GANs have revolutionized the field of generative modeling. In\nFebruary 2018, GAN secured the leading spot on the ``Top Ten Global\nBreakthrough Technologies List'' issued by the Massachusetts Science and\nTechnology Review. Over the years, numerous advancements have been proposed,\nleading to a rich array of GAN variants, such as conditional GAN, Wasserstein\nGAN, CycleGAN, and StyleGAN, among many others. This survey aims to provide a\ngeneral overview of GANs, summarizing the latent architecture, validation\nmetrics, and application areas of the most widely recognized variants. We also\ndelve into recent theoretical developments, exploring the profound connection\nbetween the adversarial principle underlying GAN and Jensen-Shannon divergence,\nwhile discussing the optimality characteristics of the GAN framework. The\nefficiency of GAN variants and their model architectures will be evaluated\nalong with training obstacles as well as training solutions. In addition, a\ndetailed discussion will be provided, examining the integration of GANs with\nnewly developed deep learning frameworks such as Transformers, Physics-Informed\nNeural Networks, Large Language models, and Diffusion models. Finally, we\nreveal several issues as well as future research outlines in this field.\n","authors":["Tanujit Chakraborty","Ujjwal Reddy K S","Shraddha M. Naik","Madhurima Panja","Bayapureddy Manvitha"],"pdf_url":"https://arxiv.org/pdf/2308.16316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10474v2","updated":"2023-08-30T20:28:13Z","published":"2023-05-17T17:59:16Z","title":"Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models","summary":" Despite tremendous progress in generating high-quality images using diffusion\nmodels, synthesizing a sequence of animated frames that are both photorealistic\nand temporally coherent is still in its infancy. While off-the-shelf\nbillion-scale datasets for image generation are available, collecting similar\nvideo data of the same scale is still challenging. Also, training a video\ndiffusion model is computationally much more expensive than its image\ncounterpart. In this work, we explore finetuning a pretrained image diffusion\nmodel with video data as a practical solution for the video synthesis task. We\nfind that naively extending the image noise prior to video noise prior in video\ndiffusion leads to sub-optimal performance. Our carefully designed video noise\nprior leads to substantially better performance. Extensive experimental\nvalidation shows that our model, Preserve Your Own Correlation (PYoCo), attains\nSOTA zero-shot text-to-video results on the UCF-101 and MSR-VTT benchmarks. 
It\nalso achieves SOTA video generation quality on the small-scale UCF-101\nbenchmark with a $10\\times$ smaller model using significantly less computation\nthan the prior art.\n","authors":["Songwei Ge","Seungjun Nah","Guilin Liu","Tyler Poon","Andrew Tao","Bryan Catanzaro","David Jacobs","Jia-Bin Huang","Ming-Yu Liu","Yogesh Balaji"],"pdf_url":"https://arxiv.org/pdf/2305.10474v2.pdf","comment":"ICCV 2023. Project webpage:\n https://research.nvidia.com/labs/dir/pyoco"},{"id":"http://arxiv.org/abs/2303.12743v4","updated":"2023-08-30T20:26:25Z","published":"2023-03-20T07:42:48Z","title":"DR.CPO: Diversified and Realistic 3D Augmentation via Iterative\n Construction, Random Placement, and HPR Occlusion","summary":" In autonomous driving, data augmentation is commonly used for improving 3D\nobject detection. The most basic methods include insertion of copied objects\nand rotation and scaling of the entire training frame. Numerous variants have\nbeen developed as well. The existing methods, however, are considerably limited\nwhen compared to the variety of the real world possibilities. In this work, we\ndevelop a diversified and realistic augmentation method that can flexibly\nconstruct a whole-body object, freely locate and rotate the object, and apply\nself-occlusion and external-occlusion accordingly. To improve the diversity of\nthe whole-body object construction, we develop an iterative method that\nstochastically combines multiple objects observed from the real world into a\nsingle object. Unlike the existing augmentation methods, the constructed\nobjects can be randomly located and rotated in the training frame because\nproper occlusions can be reflected to the whole-body objects in the final step.\nFinally, proper self-occlusion at each local object level and\nexternal-occlusion at the global frame level are applied using the Hidden Point\nRemoval (HPR) algorithm that is computationally efficient. HPR is also used for\nadaptively controlling the point density of each object according to the\nobject's distance from the LiDAR. Experiment results show that the proposed\nDR.CPO algorithm is data-efficient and model-agnostic without incurring any\ncomputational overhead. Also, DR.CPO can improve mAP performance by 2.08% when\ncompared to the best 3D detection result known for KITTI dataset. The code is\navailable at https://github.com/SNU-DRL/DRCPO.git\n","authors":["Jungwook Shin","Jaeill Kim","Kyungeun Lee","Hyunghun Cho","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2303.12743v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15370v2","updated":"2023-08-30T20:20:23Z","published":"2023-08-29T15:06:47Z","title":"Multi-Response Heteroscedastic Gaussian Process Models and Their\n Inference","summary":" Despite the widespread utilization of Gaussian process models for versatile\nnonparametric modeling, they exhibit limitations in effectively capturing\nabrupt changes in function smoothness and accommodating relationships with\nheteroscedastic errors. Addressing these shortcomings, the heteroscedastic\nGaussian process (HeGP) regression seeks to introduce flexibility by\nacknowledging the variability of residual variances across covariates in the\nregression model. In this work, we extend the HeGP concept, expanding its scope\nbeyond regression tasks to encompass classification and state-space models. To\nachieve this, we propose a novel framework where the Gaussian process is\ncoupled with a covariate-induced precision matrix process, adopting a mixture\nformulation. 
This approach enables the modeling of heteroscedastic covariance\nfunctions across covariates. To mitigate the computational challenges posed by\nsampling, we employ variational inference to approximate the posterior and\nfacilitate posterior predictive modeling. Additionally, our training process\nleverages an EM algorithm featuring closed-form M-step updates to efficiently\nevaluate the heteroscedastic covariance function. A notable feature of our\nmodel is its consistent performance on multivariate responses, accommodating\nvarious types (continuous or categorical) seamlessly. Through a combination of\nsimulations and real-world applications in climatology, we illustrate the\nmodel's prowess and advantages. By overcoming the limitations of traditional\nGaussian process models, our proposed framework offers a robust and versatile\ntool for a wide array of applications.\n","authors":["Taehee Lee","Jun S. Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15370v2.pdf","comment":"submitted to the Journal of the American Statistical Association\n (JASA)"},{"id":"http://arxiv.org/abs/2307.05628v3","updated":"2023-08-30T20:16:55Z","published":"2023-07-11T06:30:43Z","title":"DNAGPT: A Generalized Pre-trained Tool for Versatile DNA Sequence\n Analysis Tasks","summary":" Pre-trained large language models demonstrate potential in extracting\ninformation from DNA sequences, yet adapting to a variety of tasks and data\nmodalities remains a challenge. To address this, we propose DNAGPT, a\ngeneralized DNA pre-training model trained on over 200 billion base pairs from\nall mammals. By enhancing the classic GPT model with a binary classification\ntask (DNA sequence order), a numerical regression task (guanine-cytosine\ncontent prediction), and a comprehensive token language, DNAGPT can handle\nversatile DNA analysis tasks while processing both sequence and numerical data.\nOur evaluation of genomic signal and region recognition, mRNA abundance\nregression, and artificial genomes generation tasks demonstrates DNAGPT's\nsuperior performance compared to existing models designed for specific\ndownstream tasks, benefiting from pre-training using the newly designed model\nstructure.\n","authors":["Daoan Zhang","Weitong Zhang","Yu Zhao","Jianguo Zhang","Bing He","Chenchen Qin","Jianhua Yao"],"pdf_url":"https://arxiv.org/pdf/2307.05628v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06720v2","updated":"2023-08-30T20:12:12Z","published":"2023-04-13T17:59:55Z","title":"Expressive Text-to-Image Generation with Rich Text","summary":" Plain text has become a prevalent interface for text-to-image synthesis.\nHowever, its limited customization options hinder users from accurately\ndescribing desired outputs. For example, plain text makes it hard to specify\ncontinuous quantities, such as the precise RGB color value or importance of\neach word. Furthermore, creating detailed text prompts for complex scenes is\ntedious for humans to write and challenging for text encoders to interpret. To\naddress these challenges, we propose using a rich-text editor supporting\nformats such as font style, size, color, and footnote. We extract each word's\nattributes from rich text to enable local style control, explicit token\nreweighting, precise color rendering, and detailed region synthesis. We achieve\nthese capabilities through a region-based diffusion process. We first obtain\neach word's region based on attention maps of a diffusion process using plain\ntext. 
For each region, we enforce its text attributes by creating\nregion-specific detailed prompts and applying region-specific guidance, and\nmaintain its fidelity against plain-text generation through region-based\ninjections. We present various examples of image generation from rich text and\ndemonstrate that our method outperforms strong baselines with quantitative\nevaluations.\n","authors":["Songwei Ge","Taesung Park","Jun-Yan Zhu","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2304.06720v2.pdf","comment":"ICCV 2023. Project webpage: https://rich-text-to-image.github.io/"},{"id":"http://arxiv.org/abs/2304.13455v4","updated":"2023-08-30T19:44:41Z","published":"2023-04-26T11:27:34Z","title":"From Chaos Comes Order: Ordering Event Representations for Object\n Recognition and Detection","summary":" Today, state-of-the-art deep neural networks that process events first\nconvert them into dense, grid-like input representations before using an\noff-the-shelf network. However, selecting the appropriate representation for\nthe task traditionally requires training a neural network for each\nrepresentation and selecting the best one based on the validation score, which\nis very time-consuming. This work eliminates this bottleneck by selecting\nrepresentations based on the Gromov-Wasserstein Discrepancy (GWD) between raw\nevents and their representation. It is about 200 times faster to compute than\ntraining a neural network and preserves the task performance ranking of event\nrepresentations across multiple representations, network backbones, datasets,\nand tasks. Thus, finding representations with high task scores is equivalent to\nfinding representations with a low GWD. We use this insight to, for the first\ntime, perform a hyperparameter search on a large family of event\nrepresentations, revealing new and powerful representations that exceed the\nstate-of-the-art. Our optimized representations outperform existing\nrepresentations by 1.7 mAP on the 1 Mpx dataset and 0.3 mAP on the Gen1\ndataset, two established object detection benchmarks, and reach a 3.8% higher\nclassification score on the mini N-ImageNet benchmark. Moreover, we outperform\nthe state-of-the-art by 2.1 mAP on Gen1 and state-of-the-art feed-forward methods\nby 6.0 mAP on the 1 Mpx dataset. This work opens a new unexplored field of\nexplicit representation optimization for event-based learning.\n","authors":["Nikola Zubić","Daniel Gehrig","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2304.13455v4.pdf","comment":"15 pages, 11 figures, 2 tables, ICCV 2023 Camera Ready paper"},{"id":"http://arxiv.org/abs/2211.00646v3","updated":"2023-08-30T19:25:14Z","published":"2022-11-01T00:40:09Z","title":"Learning Melanocytic Cell Masks from Adjacent Stained Tissue","summary":" Melanoma is one of the most aggressive forms of skin cancer, causing a large\nproportion of skin cancer deaths. However, melanoma diagnoses by pathologists\nshow low interrater reliability. As melanoma is a cancer of the melanocyte,\nthere is a clear need to develop a melanocytic cell segmentation tool that is\nagnostic to pathologist variability and automates pixel-level annotation.\nGigapixel-level pathologist labeling, however, is impractical. 
Herein, we\npropose a means to train deep neural networks for melanocytic cell segmentation\nfrom hematoxylin and eosin (H&E) stained sections and paired\nimmunohistochemistry (IHC) of adjacent tissue sections, achieving a mean IOU of\n0.64 despite imperfect ground-truth labels.\n","authors":["Mikio Tada","Ursula E. Lang","Iwei Yeh","Maria L. Wei","Michael J. Keiser"],"pdf_url":"https://arxiv.org/pdf/2211.00646v3.pdf","comment":"Accepted at Medical Image Learning with Limited & Noisy Data\n Workshop, Medical Image Computing and Computer Assisted Interventions\n (MICCAI) 2022"},{"id":"http://arxiv.org/abs/2308.16279v1","updated":"2023-08-30T19:13:10Z","published":"2023-08-30T19:13:10Z","title":"Classification of Anomalies in Telecommunication Network KPI Time Series","summary":" The increasing complexity and scale of telecommunication networks have led to\na growing interest in automated anomaly detection systems. However, the\nclassification of anomalies detected on network Key Performance Indicators\n(KPI) has received less attention, resulting in a lack of information about\nanomaly characteristics and classification processes. To address this gap, this\npaper proposes a modular anomaly classification framework. The framework\nassumes separate entities for the anomaly classifier and the detector, allowing\nfor a distinct treatment of anomaly detection and classification tasks on time\nseries. The objectives of this study are (1) to develop a time series simulator\nthat generates synthetic time series resembling real-world network KPI\nbehavior, (2) to build a detection model to identify anomalies in the time\nseries, (3) to build classification models that accurately categorize detected\nanomalies into predefined classes, and (4) to evaluate the classification framework\nperformance on simulated and real-world network KPI time series. This study has\ndemonstrated the good performance of the anomaly classification models trained\non simulated anomalies when applied to real-world network time series data.\n","authors":["Korantin Bordeau-Aubert","Justin Whatley","Sylvain Nadeau","Tristan Glatard","Brigitte Jaumard"],"pdf_url":"https://arxiv.org/pdf/2308.16279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16274v1","updated":"2023-08-30T19:04:34Z","published":"2023-08-30T19:04:34Z","title":"Learning Diverse Features in Vision Transformers for Improved\n Generalization","summary":" Deep learning models often rely only on a small set of features even when\nthere is a rich set of predictive signals in the training data. This makes\nmodels brittle and sensitive to distribution shifts. In this work, we first\nexamine vision transformers (ViTs) and find that they tend to extract robust\nand spurious features with distinct attention heads. As a result of this\nmodularity, their performance under distribution shifts can be significantly\nimproved at test time by pruning heads corresponding to spurious features,\nwhich we demonstrate using an \"oracle selection\" on validation data. Second, we\npropose a method to further enhance the diversity and complementarity of the\nlearned features by encouraging orthogonality of the attention heads' input\ngradients. 
We observe improved out-of-distribution performance on diagnostic\nbenchmarks (MNIST-CIFAR, Waterbirds) as a consequence of the enhanced diversity\nof features and the pruning of undesirable heads.\n","authors":["Armand Mihai Nicolicioiu","Andrei Liviu Nicolicioiu","Bogdan Alexe","Damien Teney"],"pdf_url":"https://arxiv.org/pdf/2308.16274v1.pdf","comment":"2023 ICML Workshop on Spurious Correlations, Invariance and Stability"},{"id":"http://arxiv.org/abs/2308.16272v1","updated":"2023-08-30T19:02:24Z","published":"2023-08-30T19:02:24Z","title":"A numerical approach for the fractional Laplacian via deep neural\n networks","summary":" We consider the fractional elliptic problem with Dirichlet boundary\nconditions on a bounded and convex domain $D$ of $\\mathbb{R}^d$, with $d \\geq\n2$. In this paper, we perform a stochastic gradient descent algorithm that\napproximates the solution of the fractional problem via Deep Neural Networks.\nAdditionally, we provide four numerical examples to test the efficiency of the\nalgorithm, and each example will be studied for many values of $\\alpha \\in\n(1,2)$ and $d \\geq 2$.\n","authors":["Nicolás Valenzuela"],"pdf_url":"https://arxiv.org/pdf/2308.16272v1.pdf","comment":"32 pages, 21 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.16271v1","updated":"2023-08-30T19:02:17Z","published":"2023-08-30T19:02:17Z","title":"Emergence of Segmentation with Minimalistic White-Box Transformers","summary":" Transformer-like models for vision tasks have recently proven effective for a\nwide range of downstream applications such as segmentation and detection.\nPrevious works have shown that segmentation properties emerge in vision\ntransformers (ViTs) trained using self-supervised methods such as DINO, but not\nin those trained on supervised classification tasks. In this study, we probe\nwhether segmentation emerges in transformer-based models solely as a result of\nintricate self-supervised learning mechanisms, or if the same emergence can be\nachieved under much broader conditions through proper design of the model\narchitecture. Through extensive experimental results, we demonstrate that when\nemploying a white-box transformer-like architecture known as CRATE, whose\ndesign explicitly models and pursues low-dimensional structures in the data\ndistribution, segmentation properties, at both the whole and parts levels,\nalready emerge with a minimalistic supervised training recipe. Layer-wise\nfiner-grained analysis reveals that the emergent properties strongly\ncorroborate the designed mathematical functions of the white-box network. Our\nresults suggest a path to design white-box foundation models that are\nsimultaneously highly performant and mathematically fully interpretable. Code\nis at \\url{https://github.com/Ma-Lab-Berkeley/CRATE}.\n","authors":["Yaodong Yu","Tianzhe Chu","Shengbang Tong","Ziyang Wu","Druv Pai","Sam Buchanan","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2308.16271v1.pdf","comment":"Code: https://github.com/Ma-Lab-Berkeley/CRATE"},{"id":"http://arxiv.org/abs/2304.14994v2","updated":"2023-08-30T18:52:03Z","published":"2023-04-28T17:28:18Z","title":"A Stable and Scalable Method for Solving Initial Value PDEs with Neural\n Networks","summary":" Unlike conventional grid and mesh based methods for solving partial\ndifferential equations (PDEs), neural networks have the potential to break the\ncurse of dimensionality, providing approximate solutions to problems where\nusing classical solvers is difficult or impossible. 
While global minimization\nof the PDE residual over the network parameters works well for boundary value\nproblems, catastrophic forgetting impairs the applicability of this approach to\ninitial value problems (IVPs). In an alternative local-in-time approach, the\noptimization problem can be converted into an ordinary differential equation\n(ODE) on the network parameters and the solution propagated forward in time;\nhowever, we demonstrate that current methods based on this approach suffer from\ntwo key issues. First, following the ODE produces an uncontrolled growth in the\nconditioning of the problem, ultimately leading to unacceptably large numerical\nerrors. Second, as the ODE methods scale cubically with the number of model\nparameters, they are restricted to small neural networks, significantly\nlimiting their ability to represent intricate PDE initial conditions and\nsolutions. Building on these insights, we develop Neural IVP, an ODE based IVP\nsolver which prevents the network from getting ill-conditioned and runs in time\nlinear in the number of parameters, enabling us to evolve the dynamics of\nchallenging PDEs with neural networks.\n","authors":["Marc Finzi","Andres Potapczynski","Matthew Choptuik","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2304.14994v2.pdf","comment":"ICLR 2023. Code available at https://github.com/mfinzi/neural-ivp"},{"id":"http://arxiv.org/abs/2308.16259v1","updated":"2023-08-30T18:34:55Z","published":"2023-08-30T18:34:55Z","title":"Materials Informatics Transformer: A Language Model for Interpretable\n Materials Properties Prediction","summary":" Recently, the remarkable capabilities of large language models (LLMs) have\nbeen illustrated across a variety of research domains such as natural language\nprocessing, computer vision, and molecular modeling. We extend this paradigm by\nutilizing LLMs for material property prediction by introducing our model\nMaterials Informatics Transformer (MatInFormer). Specifically, we introduce a\nnovel approach that involves learning the grammar of crystallography through\nthe tokenization of pertinent space group information. We further illustrate\nthe adaptability of MatInFormer by incorporating task-specific data pertaining\nto Metal-Organic Frameworks (MOFs). Through attention visualization, we uncover\nthe key features that the model prioritizes during property prediction. The\neffectiveness of our proposed model is empirically validated across 14 distinct\ndatasets, thereby underscoring its potential for high throughput screening\nthrough accurate material property prediction.\n","authors":["Hongshuo Huang","Rishikesh Magar","Changwen Xu","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2308.16259v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.14480v2","updated":"2023-08-30T15:33:01Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. 
Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.15851v1","updated":"2023-08-30T08:35:31Z","published":"2023-08-30T08:35:31Z","title":"Prompting Vision Language Model with Knowledge from Large Language Model\n for Knowledge-Based VQA","summary":" Knowledge-based visual question answering is a very challenging and widely\nstudied task. Previous methods adopt the implicit knowledge in large\nlanguage models (LLM) to achieve excellent results, but we argue that existing\nmethods may suffer from a biased understanding of the image and insufficient\nknowledge to solve the problem. In this paper, we propose PROOFREAD -PROmpting\nvision language model with knOwledge From laRgE lAnguage moDel, a novel,\nlightweight and efficient knowledge-based VQA framework, which makes the vision\nlanguage model and the large language model cooperate to give full play to\ntheir respective strengths and bootstrap each other. In detail, our proposed\nmethod uses the LLM to obtain knowledge explicitly, uses the vision language model\nwhich can see the image to get the knowledge answer, and introduces a knowledge\nperceiver to filter out knowledge that is harmful for getting the correct final\nanswer. Experimental results on two datasets prove the effectiveness of our\napproach. Our method outperforms all state-of-the-art methods on the A-OKVQA\ndataset in two settings and also achieves relatively good performance on the\nOKVQA dataset.\n","authors":["Yang Zhou","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.15851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16250v1","updated":"2023-08-30T18:14:30Z","published":"2023-08-30T18:14:30Z","title":"It Takes a Village: Multidisciplinarity and Collaboration for the\n Development of Embodied Conversational Agents","summary":" Embodied conversational agent (ECA) development is a time-consuming and\ncostly process that calls for knowledge in a plethora of different and not\nnecessarily adjacent disciplines. Engaging in activities outside of one's core\nresearch to acquire peripheral skills can impede innovation and potentially\nrestrict the outcomes within the boundaries of those acquired skills. 
A\nproposal to tackle this challenge is creating collaborative communities of\nexperts from the disciplines contributing to the field of ECAs that, via clearly\ndefined roles, expectations and communication channels, can help extend the\nfield of ECA research.\n","authors":["Danai Korre"],"pdf_url":"https://arxiv.org/pdf/2308.16250v1.pdf","comment":"5 pages, 1 figure, ACM CUI 2023: Proceedings of the 5th Conference on\n Conversational User Interfaces - Is CUI ready yet?, This paper discusses the\n challenges of ECA development and how they can be tackled via\n multidisciplinary collaboration"},{"id":"http://arxiv.org/abs/2308.16215v1","updated":"2023-08-30T16:44:38Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control","summary":" Lossy video compression is commonly used when transmitting and storing video\ndata. Unified video codecs (e.g., H.264 or H.265) remain the \\emph{de facto}\nstandard, despite the availability of advanced (neural) compression approaches.\nTransmitting videos in the face of dynamic network bandwidth conditions\nrequires video codecs to adapt to vastly different compression strengths. Rate\ncontrol modules augment the codec's compression such that bandwidth constraints\nare satisfied and video distortion is minimized. While both standard video\ncodecs and their rate control modules are developed to minimize video distortion\nw.r.t. human quality assessment, preserving the downstream performance of deep\nvision models is not considered. In this paper, we present the first end-to-end\nlearnable deep video codec control considering both bandwidth constraints and\ndownstream vision performance, while not breaking existing standardization. We\ndemonstrate for two common vision tasks (semantic segmentation and optical flow\nestimation) and on two different datasets that our deep codec control better\npreserves downstream performance than using 2-pass average bit rate control\nwhile meeting dynamic bandwidth constraints and adhering to standardizations.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v1.pdf","comment":"22 pages, 26 figures, 6 tables"}]},"2023-08-31T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.16911v1","updated":"2023-08-31T17:59:46Z","published":"2023-08-31T17:59:46Z","title":"PointLLM: Empowering Large Language Models to Understand Point Clouds","summary":" The unprecedented advancements in Large Language Models (LLMs) have created a\nprofound impact on natural language processing but are yet to fully embrace the\nrealm of 3D understanding. This paper introduces PointLLM, a preliminary effort\nto fill this gap, thereby enabling LLMs to understand point clouds and offering\na new avenue beyond 2D visual data. PointLLM processes colored object point\nclouds with human instructions and generates contextually appropriate\nresponses, illustrating its grasp of point clouds and common sense.\nSpecifically, it leverages a point cloud encoder with a powerful LLM to\neffectively fuse geometric, appearance, and linguistic information. We collect\na novel dataset comprising 660K simple and 70K complex point-text instruction\npairs to enable a two-stage training strategy: initially aligning latent spaces\nand subsequently instruction-tuning the unified model.
To rigorously evaluate\nour model's perceptual abilities and its generalization capabilities, we\nestablish two benchmarks: Generative 3D Object Classification and 3D Object\nCaptioning, assessed through three different methods, including human\nevaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment\nresults show that PointLLM demonstrates superior performance over existing 2D\nbaselines. Remarkably, in human-evaluated object captioning tasks, PointLLM\noutperforms human annotators in over 50% of the samples. Codes, datasets, and\nbenchmarks are available at https://github.com/OpenRobotLab/PointLLM .\n","authors":["Runsen Xu","Xiaolong Wang","Tai Wang","Yilun Chen","Jiangmiao Pang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.16911v1.pdf","comment":"19 pages. Empowering large language models with 3D point cloud\n understanding, accompanied by a novel dataset and carefully designed\n benchmarks. Project page: https://runsenxu.com/projects/PointLLM"},{"id":"http://arxiv.org/abs/2308.16898v1","updated":"2023-08-31T17:57:50Z","published":"2023-08-31T17:57:50Z","title":"Transformers as Support Vector Machines","summary":" Since its inception in \"Attention Is All You Need\", transformer architecture\nhas led to revolutionary advancements in NLP. The attention layer within the\ntransformer admits a sequence of input tokens $X$ and makes them interact\nthrough pairwise similarities computed as softmax$(XQK^\\top X^\\top)$, where\n$(K,Q)$ are the trainable key-query parameters. In this work, we establish a\nformal equivalence between the optimization geometry of self-attention and a\nhard-margin SVM problem that separates optimal input tokens from non-optimal\ntokens using linear constraints on the outer-products of token pairs. This\nformalism allows us to characterize the implicit bias of 1-layer transformers\noptimized with gradient descent: (1) Optimizing the attention layer with\nvanishing regularization, parameterized by $(K,Q)$, converges in direction to\nan SVM solution minimizing the nuclear norm of the combined parameter\n$W=KQ^\\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm\nobjective. We characterize this convergence, highlighting that it can occur\ntoward locally-optimal directions rather than global ones. (2) Complementing\nthis, we prove the local/global directional convergence of gradient descent\nunder suitable geometric conditions. Importantly, we show that\nover-parameterization catalyzes global convergence by ensuring the feasibility\nof the SVM problem and by guaranteeing a benign optimization landscape devoid\nof stationary points. (3) While our theory applies primarily to linear\nprediction heads, we propose a more general SVM equivalence that predicts the\nimplicit bias with nonlinear heads. Our findings are applicable to arbitrary\ndatasets and their validity is verified via experiments. We also introduce\nseveral open problems and research directions. 
We believe these findings\ninspire the interpretation of transformers as a hierarchy of SVMs that\nseparates and selects optimal tokens.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Christos Thrampoulidis","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2308.16898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16890v1","updated":"2023-08-31T17:52:04Z","published":"2023-08-31T17:52:04Z","title":"TouchStone: Evaluating Vision-Language Models by Language Models","summary":" Large vision-language models (LVLMs) have recently witnessed rapid\nadvancements, exhibiting a remarkable capacity for perceiving, understanding,\nand processing visual information by connecting visual receptor with large\nlanguage models (LLMs). However, current assessments mainly focus on\nrecognizing and reasoning abilities, lacking direct evaluation of\nconversational skills and neglecting visual storytelling abilities. In this\npaper, we propose an evaluation method that uses strong LLMs as judges to\ncomprehensively evaluate the various abilities of LVLMs. Firstly, we construct\na comprehensive visual dialogue dataset TouchStone, consisting of open-world\nimages and questions, covering five major categories of abilities and 27\nsubtasks. This dataset not only covers fundamental recognition and\ncomprehension but also extends to literary creation. Secondly, by integrating\ndetailed image annotations we effectively transform the multimodal input\ncontent into a form understandable by LLMs. This enables us to employ advanced\nLLMs for directly evaluating the quality of the multimodal dialogue without\nrequiring human intervention. Through validation, we demonstrate that powerful\nLVLMs, such as GPT-4, can effectively score dialogue quality by leveraging\ntheir textual capabilities alone, aligning with human preferences. We hope our\nwork can serve as a touchstone for LVLMs' evaluation and pave the way for\nbuilding stronger LVLMs. The evaluation code is available at\nhttps://github.com/OFA-Sys/TouchStone.\n","authors":["Shuai Bai","Shusheng Yang","Jinze Bai","Peng Wang","Xingxuan Zhang","Junyang Lin","Xinggang Wang","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16890v1.pdf","comment":"https://github.com/OFA-Sys/TouchStone"},{"id":"http://arxiv.org/abs/2308.16884v1","updated":"2023-08-31T17:43:08Z","published":"2023-08-31T17:43:08Z","title":"The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122\n Language Variants","summary":" We present Belebele, a multiple-choice machine reading comprehension (MRC)\ndataset spanning 122 language variants. Significantly expanding the language\ncoverage of natural language understanding (NLU) benchmarks, this dataset\nenables the evaluation of text models in high-, medium-, and low-resource\nlanguages. Each question is based on a short passage from the Flores-200\ndataset and has four multiple-choice answers. The questions were carefully\ncurated to discriminate between models with different levels of general\nlanguage comprehension. The English dataset on its own proves difficult enough\nto challenge state-of-the-art language models. Being fully parallel, this\ndataset enables direct comparison of model performance across all languages. We\nuse this dataset to evaluate the capabilities of multilingual masked language\nmodels (MLMs) and large language models (LLMs). 
We present extensive results\nand find that despite significant cross-lingual transfer in English-centric\nLLMs, much smaller MLMs pretrained on balanced multilingual data still\nunderstand far more languages. We also observe that larger vocabulary size and\nconscious vocabulary construction correlate with better performance on\nlow-resource languages. Overall, Belebele opens up new avenues for evaluating\nand analyzing the multilingual capabilities of NLP systems.\n","authors":["Lucas Bandarkar","Davis Liang","Benjamin Muller","Mikel Artetxe","Satya Narayan Shukla","Donald Husa","Naman Goyal","Abhinandan Krishnan","Luke Zettlemoyer","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2308.16884v1.pdf","comment":"27 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.16871v1","updated":"2023-08-31T17:20:50Z","published":"2023-08-31T17:20:50Z","title":"The Gender-GAP Pipeline: A Gender-Aware Polyglot Pipeline for Gender\n Characterisation in 55 Languages","summary":" Gender biases in language generation systems are challenging to mitigate. One\npossible source of these biases is gender representation disparities in the\ntraining and evaluation data. Despite recent progress in documenting this\nproblem and many attempts at mitigating it, we still lack shared methodology\nand tooling to report gender representation in large datasets. Such\nquantitative reporting will enable further mitigation, e.g., via data\naugmentation. This paper describes the Gender-GAP Pipeline (for Gender-Aware\nPolyglot Pipeline), an automatic pipeline to characterize gender representation\nin large-scale datasets for 55 languages. The pipeline uses a multilingual\nlexicon of gendered person-nouns to quantify the gender representation in text.\nWe showcase it by reporting gender representation in WMT training data and\ndevelopment data for the News task, confirming that current data is skewed\ntowards masculine representation. Unbalanced datasets may indirectly\noptimize our systems towards performing better for one gender than for the others. We\nsuggest introducing our gender quantification pipeline in current datasets and,\nideally, modifying them toward a balanced representation.\n","authors":["Benjamin Muller","Belen Alastruey","Prangthip Hansanti","Elahe Kalbassi","Christophe Ropers","Eric Michael Smith","Adina Williams","Luke Zettlemoyer","Pierre Andrews","Marta R. Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2308.16871v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2307.11764v2","updated":"2023-08-31T17:09:23Z","published":"2023-07-14T17:24:15Z","title":"Sensi-BERT: Towards Sensitivity Driven Fine-Tuning for\n Parameter-Efficient BERT","summary":" Large pre-trained language models have recently gained significant traction\ndue to their improved performance on various downstream tasks like text\nclassification and question answering, requiring only a few epochs of\nfine-tuning. However, their large model sizes often prohibit their applications\non resource-constrained edge devices. Existing solutions for yielding\nparameter-efficient BERT models largely rely on compute-exhaustive training and\nfine-tuning. Moreover, they often rely on additional compute-heavy models to\nmitigate the performance gap. In this paper, we present Sensi-BERT, a\nsensitivity-driven efficient fine-tuning method for BERT models that can take an\noff-the-shelf pre-trained BERT model and yield highly parameter-efficient\nmodels for downstream tasks.
In particular, we perform sensitivity analysis to\nrank each individual parameter tensor; this ranking is then used to trim the tensors\naccordingly during fine-tuning for a given parameter or FLOPs budget. Our\nexperiments show the efficacy of Sensi-BERT across different downstream tasks\nincluding MNLI, QQP, QNLI, SST-2 and SQuAD, achieving better performance at\na similar or smaller parameter budget compared to various alternatives.\n","authors":["Souvik Kundu","Sharath Nittur Sridhar","Maciej Szankin","Sairam Sundaresan"],"pdf_url":"https://arxiv.org/pdf/2307.11764v2.pdf","comment":"6 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.16824v1","updated":"2023-08-31T15:53:51Z","published":"2023-08-31T15:53:51Z","title":"Can Programming Languages Boost Each Other via Instruction Tuning?","summary":" Once human programmers have mastered a programming language, it becomes\neasier for them to learn a new programming language. In this report, we focus on\nexploring whether programming languages can boost each other during the\ninstruction fine-tuning phase of code large language models. We conduct\nextensive experiments on 8 popular programming languages (Python, JavaScript,\nTypeScript, C, C++, Java, Go, HTML) on StarCoder. Results demonstrate that\nprogramming languages can significantly improve each other. For example,\nCodeM-Python 15B trained on Python is able to increase Java performance by an absolute\n17.95% pass@1 on HumanEval-X. More surprisingly, we found that CodeM-HTML 7B\ntrained on the HTML corpus can improve Java by an absolute 15.24% pass@1. Our\ntraining data is released at https://github.com/NL2Code/CodeM.\n","authors":["Daoguang Zan","Ailun Yu","Bo Shen","Jiaxin Zhang","Taihong Chen","Bing Geng","Bei Chen","Jichuan Ji","Yafen Yao","Yongji Wang","Qianxiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16824v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2211.11483v3","updated":"2023-08-31T15:43:56Z","published":"2022-11-21T14:18:25Z","title":"Deanthropomorphising NLP: Can a Language Model Be Conscious?","summary":" This work is intended as a voice in the discussion over previous claims that\na pretrained large language model (LLM) based on the Transformer model\narchitecture can be sentient. Such claims have been made concerning the LaMDA\nmodel and also concerning the current wave of LLM-powered chatbots, such as\nChatGPT. This claim, if confirmed, would have serious ramifications in the\nNatural Language Processing (NLP) community due to widespread use of similar\nmodels. However, here we take the position that such a large language model\ncannot be sentient, or conscious, and that LaMDA in particular exhibits no\nadvances over other similar models that would qualify it. We justify this by\nanalysing the Transformer architecture through Integrated Information Theory of\nconsciousness. We see the claims of sentience as part of a wider tendency to\nuse anthropomorphic language in NLP reporting. Regardless of the veracity of\nthe claims, we consider this an opportune moment to take stock of progress in\nlanguage modelling and consider the ethical implications of the task.
In order\nto make this work helpful for readers outside the NLP community, we also\npresent the necessary background in language modelling.\n","authors":["Matthew Shardlow","Piotr Przybyła"],"pdf_url":"https://arxiv.org/pdf/2211.11483v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09312v2","updated":"2023-08-31T15:32:01Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks, such as Reddit discussions. In contrast to traditional comment-only\nmethods, our approach to labelling a comment as hate speech involves a holistic\nanalysis of text and images grounded in the discussion context. This is done by\nleveraging graph transformers to capture the contextual relationships in the\nentire discussion surrounding a comment and grounding the interwoven fusion\nlayers that combine individual comments' text and image embeddings instead of\nprocessing modalities separately. We compare the performance of our model to\nbaselines that only process individual comments and conduct extensive ablation\nstudies. To evaluate our work, we present a new dataset, HatefulDiscussions,\ncomprising complete multi-modal discussions from multiple online communities on\nReddit. We conclude with future work for multimodal solutions to deliver social\nvalue in online contexts, arguing that capturing a holistic view of a\nconversation significantly advances the effort to detect anti-social behaviour.\n","authors":["Liam Hebert","Gaurav Sahu","Yuxuan Guo","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2308.16797v1","updated":"2023-08-31T15:19:28Z","published":"2023-08-31T15:19:28Z","title":"Simple LLM Prompting is State-of-the-Art for Robust and Multilingual\n Dialogue Evaluation","summary":" Despite significant research effort in the development of automatic dialogue\nevaluation metrics, little thought is given to evaluating dialogues other than\nin English. At the same time, ensuring metrics are invariant to semantically\nsimilar responses is also an overlooked topic. In order to achieve the desired\nproperties of robustness and multilinguality for dialogue evaluation metrics,\nwe propose a novel framework that takes advantage of the strengths of current\nevaluation models with the newly-established paradigm of prompting Large\nLanguage Models (LLMs). 
Empirical results show our framework achieves state of\nthe art results in terms of mean Spearman correlation scores across several\nbenchmarks and ranks first place on both the Robust and Multilingual tasks of\nthe DSTC11 Track 4 \"Automatic Evaluation Metrics for Open-Domain Dialogue\nSystems\", proving the evaluation capabilities of prompted LLMs.\n","authors":["John Mendonça","Patrícia Pereira","João Paulo Carvalho","Alon Lavie","Isabel Trancoso"],"pdf_url":"https://arxiv.org/pdf/2308.16797v1.pdf","comment":"DSTC11 best paper for Track 4"},{"id":"http://arxiv.org/abs/2308.16795v1","updated":"2023-08-31T15:15:26Z","published":"2023-08-31T15:15:26Z","title":"Towards Multilingual Automatic Dialogue Evaluation","summary":" The main limiting factor in the development of robust multilingual dialogue\nevaluation metrics is the lack of multilingual data and the limited\navailability of open sourced multilingual dialogue systems. In this work, we\npropose a workaround for this lack of data by leveraging a strong multilingual\npretrained LLM and augmenting existing English dialogue data using Machine\nTranslation. We empirically show that the naive approach of finetuning a\npretrained multilingual encoder model with translated data is insufficient to\noutperform the strong baseline of finetuning a multilingual model with only\nsource data. Instead, the best approach consists in the careful curation of\ntranslated data using MT Quality Estimation metrics, excluding low quality\ntranslations that hinder its performance.\n","authors":["John Mendonça","Alon Lavie","Isabel Trancoso"],"pdf_url":"https://arxiv.org/pdf/2308.16795v1.pdf","comment":"SIGDIAL23"},{"id":"http://arxiv.org/abs/2308.16770v1","updated":"2023-08-31T14:47:00Z","published":"2023-08-31T14:47:00Z","title":"Enhancing PLM Performance on Labour Market Tasks via Instruction-based\n Finetuning and Prompt-tuning with Rules","summary":" The increased digitization of the labour market has given researchers,\neducators, and companies the means to analyze and better understand the labour\nmarket. However, labour market resources, although available in high volumes,\ntend to be unstructured, and as such, research towards methodologies for the\nidentification, linking, and extraction of entities becomes more and more\nimportant. Against the backdrop of this quest for better labour market\nrepresentations, resource constraints and the unavailability of large-scale\nannotated data cause a reliance on human domain experts. We demonstrate the\neffectiveness of prompt-based tuning of pre-trained language models (PLM) in\nlabour market specific applications. 
Our results indicate that cost-efficient\nmethods such as PTR and instruction tuning without exemplars can significantly\nincrease the performance of PLMs on downstream labour market applications\nwithout introducing additional model layers, manual annotations, or data\naugmentation.\n","authors":["Jarno Vrolijk","David Graus"],"pdf_url":"https://arxiv.org/pdf/2308.16770v1.pdf","comment":"accepted for publication at RecSys in HR 2023"},{"id":"http://arxiv.org/abs/2308.16763v1","updated":"2023-08-31T14:31:48Z","published":"2023-08-31T14:31:48Z","title":"Ladder-of-Thought: Using Knowledge as Steps to Elevate Stance Detection","summary":" Chain-of-Thought Prompting (CoT) reinforces the reasoning capabilities of\nLarge Language Models (LLMs) through the generation of intermediate rationales.\nHowever, these enhancements predominantly benefit large-scale models, leaving\nsmall LMs without significant performance improvements when directly applying\nCoT. Despite the advanced reasoning capabilities of LLMs, CoT relies primarily\non their pre-trained internal knowledge. External knowledge that is\npreviously unknown to the model remains unexploited. This omission becomes\npronounced in tasks such as stance detection, where the external background\nknowledge plays a pivotal role. Additionally, the large-scale architecture of\nLLMs inevitably presents efficiency challenges during deployment. To address\nthese challenges, we introduce the Ladder-of-Thought (LoT) for stance\ndetection. Grounded in a dual-phase Cascaded Optimization framework, LoT\ndirects the model to incorporate high-quality external knowledge, enhancing the\nintermediate rationales it generates. These bolstered rationales subsequently\nserve as the foundation for more precise predictions - akin to how a ladder\nfacilitates reaching elevated goals. LoT achieves a balance between efficiency\nand accuracy, making it an adaptable and efficient framework for stance\ndetection. Our empirical evaluations underscore LoT's effectiveness, marking a\n16% improvement over ChatGPT and a 10% enhancement compared to ChatGPT with\nCoT.\n","authors":["Kairui Hu","Ming Yan","Joey Tianyi Zhou","Ivor W. Tsang","Wen Haw Chong","Yong Keong Yap"],"pdf_url":"https://arxiv.org/pdf/2308.16763v1.pdf","comment":"5 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.10811v2","updated":"2023-08-31T14:13:31Z","published":"2023-07-20T16:55:25Z","title":"\"It Felt Like Having a Second Mind\": Investigating Human-AI\n Co-creativity in Prewriting with Large Language Models","summary":" Prewriting is the process of discovering and developing ideas before a first\ndraft, which requires divergent thinking and often implies unstructured\nstrategies such as diagramming, outlining, free-writing, etc. Although large\nlanguage models (LLMs) have been demonstrated to be useful for a variety of\ntasks including creative writing, little is known about how users would\ncollaborate with LLMs to support prewriting. The preferred collaborative role\nand initiative of LLMs during such a creativity process is also unclear. To\ninvestigate human-LLM collaboration patterns and dynamics during prewriting, we\nconducted a three-session qualitative study with 15 participants in two\ncreative tasks: story writing and slogan writing. The findings indicated that\nduring collaborative prewriting, there appears to be a three-stage iterative\nHuman-AI Co-creativity process that includes Ideation, Illumination, and\nImplementation stages.
This collaborative process champions the human in a\ndominant role, in addition to mixed and shifting levels of initiative that\nexist between humans and LLMs. This research also reports on collaboration\nbreakdowns that occur during this process and on user perceptions of using existing\nLLMs during Human-AI Co-creativity, and discusses design implications to\nsupport this co-creativity process.\n","authors":["Qian Wan","Siying Hu","Yu Zhang","Piaohong Wang","Bo Wen","Zhicong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10811v2.pdf","comment":"Under Review; 25 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.06566v4","updated":"2023-08-31T13:43:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items. Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16705v1","updated":"2023-08-31T13:14:47Z","published":"2023-08-31T13:14:47Z","title":"CReHate: Cross-cultural Re-annotation of English Hate Speech Dataset","summary":" English datasets predominantly reflect the perspectives of certain\nnationalities, which can lead to cultural biases in models and datasets. This\nis particularly problematic in tasks heavily influenced by subjectivity, such\nas hate speech detection. To delve into how individuals from different\ncountries perceive hate speech, we introduce CReHate, a cross-cultural\nre-annotation of the sampled SBIC dataset. This dataset includes annotations\nfrom five distinct countries: Australia, Singapore, South Africa, the United\nKingdom, and the United States. Our thorough statistical analysis highlights\nsignificant differences based on nationality, with only 59.4% of the samples\nachieving consensus among all countries. We also introduce a culturally\nsensitive hate speech classifier via transfer learning, adept at capturing\nperspectives of different nationalities.
These findings underscore the need to\nre-evaluate certain aspects of NLP research, especially with regard to the\nnuanced nature of hate speech in the English language.\n","authors":["Nayeon Lee","Chani Jung","Junho Myung","Jiho Jin","Juho Kim","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2308.16705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16692v1","updated":"2023-08-31T12:53:09Z","published":"2023-08-31T12:53:09Z","title":"SpeechTokenizer: Unified Speech Tokenizer for Speech Large Language\n Models","summary":" Current speech large language models build upon discrete speech\nrepresentations, which can be categorized into semantic tokens and acoustic\ntokens. However, existing speech tokens are not specifically designed for\nspeech language modeling. To assess the suitability of speech tokens for\nbuilding speech language models, we established the first benchmark,\nSLMTokBench. Our results indicate that neither semantic nor acoustic tokens are\nideal for this purpose. Therefore, we propose SpeechTokenizer, a unified speech\ntokenizer for speech large language models. SpeechTokenizer adopts the\nEncoder-Decoder architecture with residual vector quantization (RVQ). Unifying\nsemantic and acoustic tokens, SpeechTokenizer disentangles different aspects of\nspeech information hierarchically across different RVQ layers. Furthermore, we\nconstruct a Unified Speech Language Model (USLM) leveraging SpeechTokenizer.\nExperiments show that SpeechTokenizer performs comparably to EnCodec in speech\nreconstruction and demonstrates strong performance on the SLMTokBench\nbenchmark. Also, USLM outperforms VALL-E in zero-shot Text-to-Speech tasks.\nCode and models are available at\nhttps://github.com/ZhangXInFD/SpeechTokenizer/.\n","authors":["Xin Zhang","Dong Zhang","Shimin Li","Yaqian Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.16692v1.pdf","comment":"SpeechTokenizer project page is\n https://0nutation.github.io/SpeechTokenizer.github.io/"},{"id":"http://arxiv.org/abs/2308.16688v1","updated":"2023-08-31T12:45:53Z","published":"2023-08-31T12:45:53Z","title":"Using Large Language Models to Automate Category and Trend Analysis of\n Scientific Articles: An Application in Ophthalmology","summary":" Purpose: In this paper, we present an automated method for article\nclassification, leveraging the power of Large Language Models (LLMs). The\nprimary focus is on the field of ophthalmology, but the model is extendable to\nother fields. Methods: We have developed a model based on Natural Language\nProcessing (NLP) techniques, including advanced LLMs, to process and analyze\nthe textual content of scientific papers. Specifically, we have employed\nzero-shot learning (ZSL) LLM models and compared them against Bidirectional and\nAuto-Regressive Transformers (BART) and its variants, as well as Bidirectional Encoder\nRepresentations from Transformers (BERT) and its variants, such as distilBERT,\nSciBERT, PubmedBERT, and BioBERT. Results: The classification results demonstrate\nthe effectiveness of LLMs in categorizing a large number of ophthalmology papers\nwithout human intervention. To evaluate the LLMs, we compiled a dataset\n(RenD) of 1000 ocular disease-related articles, which were expertly annotated\nby a panel of six specialists into 15 distinct categories. The model achieved a\nmean accuracy of 0.86 and a mean F1 of 0.85 based on the RenD dataset.\nConclusion: The proposed framework achieves notable improvements in both\naccuracy and efficiency.
Its application in the domain of ophthalmology\nshowcases its potential for knowledge organization and retrieval in other\ndomains too. We also performed a trend analysis that enables researchers and\nclinicians to easily categorize and retrieve relevant papers, saving time and\neffort in literature review and information gathering, as well as in the identification\nof emerging scientific trends within different disciplines. Moreover, the\nextensibility of the model to other scientific fields broadens its impact in\nfacilitating research and trend analysis across diverse disciplines.\n","authors":["Hina Raja","Asim Munawar","Mohammad Delsoz","Mohammad Elahi","Yeganeh Madadi","Amr Hassan","Hashem Abu Serhan","Onur Inam","Luis Hermandez","Sang Tran","Wuqas Munir","Alaa Abd-Alrazaq","Hao Chen","Siamak Yousefi"],"pdf_url":"https://arxiv.org/pdf/2308.16688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16687v1","updated":"2023-08-31T12:43:18Z","published":"2023-08-31T12:43:18Z","title":"DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew","summary":" We present DictaBERT, a new state-of-the-art pre-trained BERT model for\nmodern Hebrew, outperforming existing models on most benchmarks. Additionally,\nwe release two fine-tuned versions of the model, designed to perform two\nspecific foundational tasks in the analysis of Hebrew texts: prefix\nsegmentation and morphological tagging. These fine-tuned models allow any\ndeveloper to perform prefix segmentation and morphological tagging of a Hebrew\nsentence with a single call to a HuggingFace model, without the need to\nintegrate any additional libraries or code. In this paper we describe the\ndetails of the training as well as the results on the different benchmarks. We\nrelease the models to the community, along with sample code demonstrating their\nuse. We release these models as part of our goal to help further research and\ndevelopment in Hebrew NLP.\n","authors":["Shaltiel Shmidman","Avi Shmidman","Moshe Koppel"],"pdf_url":"https://arxiv.org/pdf/2308.16687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07462v2","updated":"2023-08-31T11:09:16Z","published":"2023-08-14T21:19:44Z","title":"Playing with Words: Comparing the Vocabulary and Lexical Richness of\n ChatGPT and Humans","summary":" The introduction of Artificial Intelligence (AI) generative language models\nsuch as GPT (Generative Pre-trained Transformer) and tools such as ChatGPT has\ntriggered a revolution that can transform how text is generated. This has many\nimplications, for example, as AI-generated text becomes a significant fraction\nof the text, would this have an effect on the language capabilities of readers\nand also on the training of newer AI tools? Would it affect the evolution of\nlanguages? Focusing on one specific aspect of the language, words: will the use\nof tools such as ChatGPT increase or reduce the vocabulary used or the lexical\nrichness? This has implications for words, as those not included in\nAI-generated content will tend to be less and less popular and may eventually\nbe lost. In this work, we perform an initial comparison of the vocabulary and\nlexical richness of ChatGPT and humans when performing the same tasks. In more\ndetail, we use two datasets containing the answers to different types of questions\nanswered by ChatGPT and humans, and a third dataset in which ChatGPT\nparaphrases sentences and questions. The analysis shows that ChatGPT\ntends to use fewer distinct words and lower lexical richness than humans.
These\nresults are very preliminary and additional datasets and ChatGPT configurations\nhave to be evaluated to extract more general conclusions. Therefore, further\nresearch is needed to understand how the use of ChatGPT and, more broadly,\ngenerative AI tools will affect the vocabulary and lexical richness in\ndifferent types of text and languages.\n","authors":["Pedro Reviriego","Javier Conde","Elena Merino-Gómez","Gonzalo Martínez","José Alberto Hernández"],"pdf_url":"https://arxiv.org/pdf/2308.07462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16622v1","updated":"2023-08-31T10:31:19Z","published":"2023-08-31T10:31:19Z","title":"Developing a Scalable Benchmark for Assessing Large Language Models in\n Knowledge Graph Engineering","summary":" As the field of Large Language Models (LLMs) evolves at an accelerated pace,\nthe critical need to assess and monitor their performance emerges. We introduce\na benchmarking framework focused on knowledge graph engineering (KGE)\naccompanied by three challenges addressing syntax and error correction, fact\nextraction and dataset generation. We show that while being a useful tool, LLMs\nare as yet unfit to assist in knowledge graph generation with zero-shot prompting.\nConsequently, our LLM-KG-Bench framework provides automatic evaluation and\nstorage of LLM responses as well as statistical data and visualization tools to\nsupport tracking of prompt engineering and model performance.\n","authors":["Lars-Peter Meyer","Johannes Frey","Kurt Junghanns","Felix Brei","Kirill Bulert","Sabine Gründer-Fahrer","Michael Martin"],"pdf_url":"https://arxiv.org/pdf/2308.16622v1.pdf","comment":"To be published in SEMANTICS 2023 poster track proceedings. SEMANTICS\n 2023 EU: 19th International Conference on Semantic Systems, September 20-22,\n 2023, Leipzig, Germany"},{"id":"http://arxiv.org/abs/2308.16593v1","updated":"2023-08-31T09:50:33Z","published":"2023-08-31T09:50:33Z","title":"Towards Spontaneous Style Modeling with Semi-supervised Pre-training for\n Conversational Text-to-Speech Synthesis","summary":" The spontaneous behavior that often occurs in conversations makes speech more\nhuman-like compared to reading-style speech. However, synthesizing spontaneous-style\nspeech is challenging due to the lack of high-quality spontaneous datasets and\nthe high cost of labeling spontaneous behavior. In this paper, we propose a\nsemi-supervised pre-training method to increase the amount of spontaneous-style\nspeech and spontaneous behavioral labels. In the process of semi-supervised\nlearning, both text and speech information are considered for detecting\nspontaneous behavior labels in speech.
Moreover, a linguistic-aware encoder is\nused to model the relationships between sentences in the conversation.\nExperimental results indicate that our proposed method achieves superior\nexpressive speech synthesis performance with the ability to model spontaneous\nbehavior in spontaneous-style speech and predict reasonable spontaneous\nbehavior from text.\n","authors":["Weiqin Li","Shun Lei","Qiaochu Huang","Yixuan Zhou","Zhiyong Wu","Shiyin Kang","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16593v1.pdf","comment":"Accepted by INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2307.01458v2","updated":"2023-08-31T09:39:29Z","published":"2023-07-04T03:34:19Z","title":"CARE-MI: Chinese Benchmark for Misinformation Evaluation in Maternity\n and Infant Care","summary":" The recent advances in natural language processing (NLP) have led to a new\ntrend of applying large language models (LLMs) to real-world scenarios. While\nthe latest LLMs are astonishingly fluent when interacting with humans, they\nsuffer from the misinformation problem by unintentionally generating factually\nfalse statements. This can lead to harmful consequences, especially when\nproduced within sensitive contexts, such as healthcare. Yet few previous works\nhave focused on evaluating misinformation in the long-form (LF) generation of\nLLMs, especially for knowledge-intensive topics. Moreover, although LLMs have\nbeen shown to perform well in different languages, misinformation evaluation\nhas been mostly conducted in English. To this end, we present a benchmark,\nCARE-MI, for evaluating LLM misinformation in: 1) a sensitive topic,\nspecifically the maternity and infant care domain; and 2) a language other than\nEnglish, namely Chinese. Most importantly, we provide an innovative paradigm\nfor building LF generation evaluation benchmarks that can be transferred to\nother knowledge-intensive domains and low-resourced languages. Our proposed\nbenchmark fills the gap between the extensive usage of LLMs and the lack of\ndatasets for assessing the misinformation generated by these models. It\ncontains 1,612 expert-checked questions, accompanied by human-selected\nreferences. Using our benchmark, we conduct extensive experiments and find\nthat current Chinese LLMs are far from perfect on the topic of maternity and\ninfant care. In an effort to minimize the reliance on human resources for\nperformance evaluation, we offer off-the-shelf judgment models for\nautomatically assessing the LF output of LLMs given benchmark questions.\nMoreover, we compare potential solutions for LF generation evaluation and\nprovide insights for building better automated metrics.\n","authors":["Tong Xiang","Liangzhi Li","Wangyue Li","Mingbai Bai","Lu Wei","Bowen Wang","Noa Garcia"],"pdf_url":"https://arxiv.org/pdf/2307.01458v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16588v1","updated":"2023-08-31T09:35:52Z","published":"2023-08-31T09:35:52Z","title":"Interpreting Sentiment Composition with Latent Semantic Tree","summary":" As the key to sentiment analysis, sentiment composition considers the\nclassification of a constituent via classifications of its contained\nsub-constituents and rules operated on them. Such compositionality has been\nwidely studied previously in the form of hierarchical trees including untagged\nand sentiment ones, which are intrinsically suboptimal in our view. To address\nthis, we propose semantic tree, a new tree form capable of interpreting the\nsentiment composition in a principled way.
Semantic tree is a derivation of a\ncontext-free grammar (CFG) describing the specific composition rules on\ndifferent semantic roles, which is designed carefully following previous\nlinguistic conclusions. However, the semantic tree is a latent variable since it\nhas no annotation in regular datasets. Thus, in our method, it is\nmarginalized out via the inside algorithm and learned to optimize the\nclassification performance. Quantitative and qualitative results demonstrate\nthat our method not only achieves better or competitive results compared to\nbaselines in the setting of regular and domain adaptation classification, but\nalso generates plausible tree explanations.\n","authors":["Zhongtao Jiang","Yuanzhe Zhang","Cao Liu","Jiansong Chen","Jun Zhao","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16588v1.pdf","comment":"Findings of ACL2023"},{"id":"http://arxiv.org/abs/2308.16584v1","updated":"2023-08-31T09:29:35Z","published":"2023-08-31T09:29:35Z","title":"Unsupervised Text Style Transfer with Deep Generative Models","summary":" We present a general framework for unsupervised text style transfer with deep\ngenerative models. The framework models each sentence-label pair in the\nnon-parallel corpus as partially observed from a complete quadruplet which\nadditionally contains two latent codes representing the content and style,\nrespectively. These codes are learned by exploiting dependencies inside the\nobserved data. Then a sentence is transferred by manipulating them. Our\nframework is able to unify previous embedding and prototype methods as two\nspecial forms. It also provides a principled perspective to explain previously\nproposed techniques in the field such as aligned encoder and adversarial\ntraining. We further conduct experiments on three benchmarks. Both automatic\nand human evaluation results show that our methods achieve better or\ncompetitive results compared to several strong baselines.\n","authors":["Zhongtao Jiang","Yuanzhe Zhang","Yiming Ju","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16577v1","updated":"2023-08-31T09:19:15Z","published":"2023-08-31T09:19:15Z","title":"Improving Mandarin Prosodic Structure Prediction with Multi-level\n Contextual Information","summary":" For text-to-speech (TTS) synthesis, prosodic structure prediction (PSP) plays\nan important role in producing natural and intelligible speech. Although\ninter-utterance linguistic information can influence the speech interpretation\nof the target utterance, previous works on PSP mainly focus on utilizing\nintra-utterance linguistic information of the current utterance only. This work\nproposes to use inter-utterance linguistic information to improve the\nperformance of PSP. Multi-level contextual information, which includes both\ninter-utterance and intra-utterance linguistic information, is extracted by a\nhierarchical encoder from character level, utterance level and discourse level\nof the input text. Then a multi-task learning (MTL) decoder predicts prosodic\nboundaries from multi-level contextual information. Objective evaluation\nresults on two datasets show that our method achieves better F1 scores in\npredicting prosodic word (PW), prosodic phrase (PPH) and intonational phrase\n(IPH). It demonstrates the effectiveness of using multi-level contextual\ninformation for PSP.
Subjective preference tests also indicate that the naturalness\nof the synthesized speech is improved.\n","authors":["Jie Chen","Changhe Song","Deyi Tuo","Xixin Wu","Shiyin Kang","Zhiyong Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16577v1.pdf","comment":"Accepted by Interspeech2022"},{"id":"http://arxiv.org/abs/2308.10959v2","updated":"2023-08-31T09:14:17Z","published":"2023-08-21T18:14:00Z","title":"DocPrompt: Large-scale continue pretrain for zero-shot and few-shot\n document question answering","summary":" In this paper, we propose Docprompt for document question answering tasks\nwith powerful zero-shot and few-shot performance. We propose a novel weakly\nsupervised data generation method, a novel multi-stage training method and a\nnovel understanding model \\& generation model ensemble method. We achieved\nstate-of-the-art performance on 4 document question answering tasks. This\nmethod greatly improves the delivery efficiency and model performance of\ndocument question answering customer projects, reducing annotation costs and\nlabor costs. Our demo can be found at\nhttps://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout.\n","authors":["Sijin Wu","Dan Zhang","Teng Hu","Shikun Feng"],"pdf_url":"https://arxiv.org/pdf/2308.10959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13916v2","updated":"2023-08-31T08:53:34Z","published":"2023-08-26T16:51:17Z","title":"Exploring Large Language Models for Knowledge Graph Completion","summary":" Knowledge graphs play a vital role in numerous artificial intelligence tasks,\nyet they frequently face the issue of incompleteness. In this study, we explore\nutilizing Large Language Models (LLM) for knowledge graph completion. We\nconsider triples in knowledge graphs as text sequences and introduce an\ninnovative framework called Knowledge Graph LLM (KG-LLM) to model these\ntriples. Our technique employs entity and relation descriptions of a triple as\nprompts and utilizes the response for predictions. Experiments on various\nbenchmark knowledge graphs demonstrate that our method attains state-of-the-art\nperformance in tasks such as triple classification and relation prediction. We\nalso find that fine-tuning relatively smaller models (e.g., LLaMA-7B,\nChatGLM-6B) outperforms recent ChatGPT and GPT-4.\n","authors":["Liang Yao","Jiazhen Peng","Chengsheng Mao","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2308.13916v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2304.11073v3","updated":"2023-08-31T08:51:33Z","published":"2023-04-20T09:30:50Z","title":"OLISIA: a Cascade System for Spoken Dialogue State Tracking","summary":" Though Dialogue State Tracking (DST) is a core component of spoken dialogue\nsystems, recent work on this task mostly deals with chat corpora, disregarding\nthe discrepancies between spoken and written language. In this paper, we propose\nOLISIA, a cascade system which integrates an Automatic Speech Recognition (ASR)\nmodel and a DST model. We introduce several adaptations in the ASR and DST\nmodules to improve integration and robustness to spoken conversations. With\nthese adaptations, our system ranked first in DSTC11 Track 3, a benchmark to\nevaluate spoken DST.
We conduct an in-depth analysis of the results and find\nthat normalizing the ASR outputs and adapting the DST inputs through data\naugmentation, along with increasing the pre-trained model size, all play an\nimportant role in reducing the performance discrepancy between written and\nspoken conversations.\n","authors":["Léo Jacqmin","Lucas Druart","Yannick Estève","Benoît Favre","Lina Maria Rojas-Barahona","Valentin Vielzeuf"],"pdf_url":"https://arxiv.org/pdf/2304.11073v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16549v1","updated":"2023-08-31T08:40:41Z","published":"2023-08-31T08:40:41Z","title":"Thesis Distillation: Investigating The Impact of Bias in NLP Models on\n Hate Speech Detection","summary":" This paper is a summary of the work in my PhD thesis, in which I investigate\nthe impact of bias in NLP models on the task of hate speech detection from\nthree perspectives: explainability, offensive stereotyping bias, and fairness.\nI discuss the main takeaways from my thesis and how they can benefit the\nbroader NLP community. Finally, I discuss important future research directions.\nThe findings of my thesis suggest that bias in NLP models impacts the task of\nhate speech detection from all three perspectives, and that unless we start\nincorporating social sciences in studying bias in NLP models, we will not\neffectively overcome the current limitations of measuring and mitigating bias\nin NLP models.\n","authors":["Fatma Elsafoury"],"pdf_url":"https://arxiv.org/pdf/2308.16549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16540v1","updated":"2023-08-31T08:30:20Z","published":"2023-08-31T08:30:20Z","title":"Time-Varying Quasi-Closed-Phase Analysis for Accurate Formant Tracking\n in Speech Signals","summary":" In this paper, we propose a new method for the accurate estimation and\ntracking of formants in speech signals using time-varying quasi-closed-phase\n(TVQCP) analysis. Conventional formant tracking methods typically adopt a\ntwo-stage estimate-and-track strategy wherein an initial set of formant\ncandidates are estimated using short-time analysis (e.g., 10--50 ms), followed\nby a tracking stage based on dynamic programming or a linear state-space model.\nOne of the main disadvantages of these approaches is that the tracking stage,\nhowever good it may be, cannot improve upon the formant estimation accuracy of\nthe first stage. The proposed TVQCP method provides a single-stage formant\ntracking that combines the estimation and tracking stages into one. TVQCP\nanalysis combines three approaches to improve formant estimation and tracking:\n(1) it uses temporally weighted quasi-closed-phase analysis to derive\nclosed-phase estimates of the vocal tract with reduced interference from the\nexcitation source, (2) it increases the residual sparsity by using the $L_1$\noptimization and (3) it uses time-varying linear prediction analysis over long\ntime windows (e.g., 100--200 ms) to impose a continuity constraint on the vocal\ntract model and hence on the formant trajectories. Formant tracking experiments\nwith a wide variety of synthetic and natural speech signals show that the\nproposed TVQCP method performs better than conventional and popular formant\ntracking tools, such as Wavesurfer and Praat (based on dynamic programming),\nthe KARMA algorithm (based on Kalman filtering), and DeepFormants (based on\ndeep neural networks trained in a supervised manner).
Matlab scripts for the\nproposed method can be found at: https://github.com/njaygowda/ftrack\n","authors":["Dhananjaya Gowda","Sudarsana Reddy Kadiri","Brad Story","Paavo Alku"],"pdf_url":"https://arxiv.org/pdf/2308.16540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16537v1","updated":"2023-08-31T08:28:11Z","published":"2023-08-31T08:28:11Z","title":"The Smart Data Extractor, a Clinician Friendly Solution to Accelerate\n and Improve the Data Collection During Clinical Trials","summary":" In medical research, the traditional way to collect data, i.e. browsing\npatient files, has been proven to induce bias, errors, human labor and costs.\nWe propose a semi-automated system able to extract every type of data,\nincluding notes. The Smart Data Extractor pre-populates clinic research forms\nby following rules. We performed a cross-testing experiment to compare\nsemi-automated to manual data collection. 20 target items had to be collected\nfor 79 patients. The average time to complete one form was 6'81'' for manual\ndata collection and 3'22'' with the Smart Data Extractor. There were also more\nmistakes during manual data collection (163 for the whole cohort) than with the\nSmart Data Extractor (46 for the whole cohort). We present an easy to use,\nunderstandable and agile solution to fill out clinical research forms. It\nreduces human effort and provides higher quality data, avoiding data re-entry\nand fatigue induced errors.\n","authors":["Sophie Quennelle","Maxime Douillet","Lisa Friedlander","Olivia Boyer","Anita Burgun","Antoine Neuraz","Nicolas Garcelon"],"pdf_url":"https://arxiv.org/pdf/2308.16537v1.pdf","comment":"IOS Press, 2023, Studies in Health Technology and Informatics"},{"id":"http://arxiv.org/abs/2308.16498v1","updated":"2023-08-31T07:00:21Z","published":"2023-08-31T07:00:21Z","title":"Generalised Winograd Schema and its Contextuality","summary":" Ambiguities in natural language give rise to probability distributions over\ninterpretations. The distributions are often over multiple ambiguous words at a\ntime; a multiplicity which makes them a suitable topic for sheaf-theoretic\nmodels of quantum contextuality. Previous research showed that different\nquantitative measures of contextuality correlate well with Psycholinguistic\nresearch on lexical ambiguities. In this work, we focus on coreference\nambiguities and investigate the Winograd Schema Challenge (WSC), a test\nproposed by Levesque in 2011 to evaluate the intelligence of machines. The WSC\nconsists of a collection of multiple-choice questions that require\ndisambiguating pronouns in sentences structured according to the Winograd\nschema, in a way that makes it difficult for machines to determine the correct\nreferents but remains intuitive for human comprehension. In this study, we\npropose an approach that analogously models the Winograd schema as an\nexperiment in quantum physics. However, we argue that the original Winograd\nSchema is inherently too simplistic to facilitate contextuality. We introduce a\nnovel mechanism for generalising the schema, rendering it analogous to a\nBell-CHSH measurement scenario. We report an instance of this generalised\nschema, complemented by the human judgements we gathered via a crowdsourcing\nplatform. 
The resulting model violates the Bell-CHSH inequality by 0.192, thus\nexhibiting contextuality in a coreference resolution setting.\n","authors":["Kin Ian Lo","Mehrnoosh Sadrzadeh","Shane Mansfield"],"pdf_url":"https://arxiv.org/pdf/2308.16498v1.pdf","comment":"In Proceedings QPL 2023, arXiv:2308.15489"},{"id":"http://arxiv.org/abs/2308.16475v1","updated":"2023-08-31T05:40:14Z","published":"2023-08-31T05:40:14Z","title":"Transformer Compression via Subspace Projection","summary":" We propose TCSP, a novel method for compressing a transformer model by\nfocusing on reducing the hidden size of the model. By projecting the whole\ntransformer model into a subspace, we enable matrix operations between the weight\nmatrices in the model and features in a reduced-dimensional space, leading to\nsignificant reductions in model parameters and computing resources. To\nestablish this subspace, we decompose the feature matrix, derived from\ndifferent layers of sampled data instances, into a projection matrix. For\nevaluation, TCSP is applied to compress T5 and BERT models on the GLUE and\nSQuAD benchmarks. Experimental results demonstrate that TCSP achieves a\ncompression ratio of 44\\% with at most 1.6\\% degradation in accuracy,\nsurpassing or matching prior compression methods. Furthermore, TCSP exhibits\ncompatibility with other methods targeting filter and attention head size\ncompression.\n","authors":["Yuxuan Hu","Jing Zhang","Chen Zhao","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.16475v1.pdf","comment":"21 pages, 1 figures"},{"id":"http://arxiv.org/abs/2308.16474v1","updated":"2023-08-31T05:37:21Z","published":"2023-08-31T05:37:21Z","title":"Enhancing Subtask Performance of Multi-modal Large Language Model","summary":" Multi-modal Large Language Model (MLLM) refers to a model expanded from a\nLarge Language Model (LLM) that possesses the capability to handle and infer\nmulti-modal data. Current MLLMs typically begin by using LLMs to decompose\ntasks into multiple subtasks, then employing individual pre-trained models to\ncomplete specific subtasks, and ultimately utilizing LLMs to integrate the\nresults of each subtask to obtain the results of the task. In real-world\nscenarios, when dealing with large projects, it is common practice to break\ndown the project into smaller sub-projects, with different teams providing\ncorresponding solutions or results. The project owner then decides which\nsolution or result to use, ensuring the best possible outcome for each subtask\nand, consequently, for the entire project. Inspired by this practice, this study\nconsiders selecting multiple pre-trained models to complete the same subtask.\nBy combining the results from multiple pre-trained models, the optimal subtask\nresult is obtained, enhancing the performance of the MLLM. Specifically, this\nstudy first selects multiple pre-trained models focused on the same subtask\nbased on distinct evaluation approaches, and then invokes these models in\nparallel to process input data and generate corresponding subtask results.\nFinally, the results from multiple pre-trained models for the same subtask are\ncompared using the LLM, and the best result is chosen as the outcome for that\nsubtask. Extensive experiments are conducted in this study using GPT-4\nannotated datasets and human-annotated datasets.
The results of various\nevaluation metrics demonstrate the effectiveness of the proposed\napproach in this paper.\n","authors":["Yongqiang Zhao","Zhenyu Li","Feng Zhang","Xinhai Xu","Donghong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16469v1","updated":"2023-08-31T05:25:04Z","published":"2023-08-31T05:25:04Z","title":"Link Prediction for Wikipedia Articles as a Natural Language Inference\n Task","summary":" The link prediction task is vital to automatically understanding the structure of\nlarge knowledge bases. In this paper, we present our system to solve this task\nat the Data Science and Advanced Analytics 2023 Competition \"Efficient and\nEffective Link Prediction\" (DSAA-2023 Competition) with a corpus containing\n948,233 training examples and 238,265 examples for public testing. This paper introduces an\napproach to link prediction in Wikipedia articles by formulating it as a\nnatural language inference (NLI) task. Drawing inspiration from recent\nadvancements in natural language processing and understanding, we cast link\nprediction as an NLI task, wherein the presence of a link between two articles\nis treated as a premise, and the task is to determine whether this premise\nholds based on the information presented in the articles. We implemented our\nsystem based on the Sentence Pair Classification for Link Prediction for the\nWikipedia Articles task. Our system achieved 0.99996 Macro F1-score and 1.00000\nMacro F1-score for the public and private test sets, respectively. Our team\nUIT-NLP ranked 3rd in performance on the private test set, equal to the scores\nof the first and second places. Our code is publicly available for research purposes.\n","authors":["Chau-Thang Phan","Quoc-Nam Nguyen","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.16469v1.pdf","comment":"Accepted at the 10th IEEE International Conference On Data Science\n And Advanced Analytics (DSAA 2023)"},{"id":"http://arxiv.org/abs/2308.16463v1","updated":"2023-08-31T05:15:27Z","published":"2023-08-31T05:15:27Z","title":"Sparkles: Unlocking Chats Across Multiple Images for Multimodal\n Instruction-Following Models","summary":" Large language models exhibit enhanced zero-shot performance on various tasks\nwhen fine-tuned with instruction-following data. Multimodal\ninstruction-following models extend these capabilities by integrating both text\nand images. However, existing models such as MiniGPT-4 face challenges in\nmaintaining dialogue coherence in scenarios involving multiple images. A\nprimary reason is the lack of a specialized dataset for this critical\napplication. To bridge these gaps, we present SparklesChat, a multimodal\ninstruction-following model for open-ended dialogues across multiple images. To\nsupport the training, we introduce SparklesDialogue, the first\nmachine-generated dialogue dataset tailored for word-level interleaved\nmulti-image and text interactions. Furthermore, we construct SparklesEval, a\nGPT-assisted benchmark for quantitatively assessing a model's conversational\ncompetence across multiple images and dialogue turns. Our experiments validate\nthe effectiveness of SparklesChat in understanding and reasoning across\nmultiple images and dialogue turns. Specifically, SparklesChat outperformed\nMiniGPT-4 on established vision-and-language benchmarks, including the BISON\nbinary image selection task and the NLVR2 visual reasoning task. 
Moreover,\nSparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding\nMiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative\nevaluations further demonstrate SparklesChat's generality in handling\nreal-world applications. All resources will be available at\nhttps://github.com/HYPJUDY/Sparkles.\n","authors":["Yupan Huang","Zaiqiao Meng","Fangyu Liu","Yixuan Su","Nigel Collier","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16458v1","updated":"2023-08-31T04:52:58Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained language models like ChatGPT have significantly improved code\ngeneration. As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks. Moreover, in bioinformatics, generating\nfunctional programs poses additional notable challenges due to the amount of\ndomain knowledge, the need for complicated data operations, and intricate\nfunctional dependencies between the operations. Here, we present BioCoder, a\nbenchmark developed to evaluate existing pre-trained models in generating\nbioinformatics code. In relation to function-code generation, BioCoder covers\npotential package dependencies, class declarations, and global variables. It\nincorporates 1026 functions and 1243 methods in Python and Java from GitHub and\n253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing\nframework for evaluation, and we have applied it to evaluate many models\nincluding InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+,\nInstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes\nthe importance of domain knowledge, pragmatic code generation, and contextual\nunderstanding. Our dataset, benchmark, Docker images, and scripts required for\ntesting are all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06345v2","updated":"2023-08-31T03:14:47Z","published":"2023-06-10T05:24:29Z","title":"Improving Non-autoregressive Translation Quality with Pretrained\n Language Model, Embedding Distillation and Upsampling Strategy for CTC","summary":" Non-autoregressive approaches aim to improve the inference speed of\ntranslation models, particularly those that generate output in a one-pass\nforward manner. However, these approaches often suffer from a significant drop\nin translation quality compared to autoregressive models. This paper introduces\na series of innovative techniques to enhance the translation quality of\nNon-Autoregressive Translation (NAT) models while maintaining a substantial\nacceleration in inference speed. We propose fine-tuning Pretrained Multilingual\nLanguage Models (PMLMs) with the CTC loss to train NAT models effectively.\nFurthermore, we adopt the MASK insertion scheme for up-sampling instead of\ntoken duplication, and we present an embedding distillation method to further\nenhance performance. In our experiments, our model outperforms the baseline\nautoregressive model (Transformer \\textit{base}) on multiple datasets,\nincluding WMT'14 DE$\\leftrightarrow$EN, WMT'16 RO$\\leftrightarrow$EN, and\nIWSLT'14 DE$\\leftrightarrow$EN. 
Notably, our model achieves better performance\nthan the baseline autoregressive model on the IWSLT'14 En$\\leftrightarrow$De\nand WMT'16 En$\\leftrightarrow$Ro datasets, even without using distillation data\nduring training. It is worth highlighting that on the IWSLT'14\nDE$\\rightarrow$EN dataset, our model achieves an impressive BLEU score of\n39.59, setting a new state-of-the-art performance. Additionally, our model\nexhibits a remarkable speed improvement of 16.35 times compared to the\nautoregressive model.\n","authors":["Shen-sian Syu","Juncheng Xie","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2306.06345v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.16415v1","updated":"2023-08-31T02:58:33Z","published":"2023-08-31T02:58:33Z","title":"Knowledge Distillation from Non-streaming to Streaming ASR Encoder using\n Auxiliary Non-streaming Layer","summary":" Streaming automatic speech recognition (ASR) models are restricted from\naccessing future context, which results in worse performance compared to the\nnon-streaming models. To improve the performance of streaming ASR, knowledge\ndistillation (KD) from the non-streaming to streaming model has been studied,\nmainly focusing on aligning the output token probabilities. In this paper, we\npropose a layer-to-layer KD from the teacher encoder to the student encoder. To\nensure that features are extracted using the same context, we insert auxiliary\nnon-streaming branches to the student and perform KD from the non-streaming\nteacher layer to the non-streaming auxiliary layer. We design a special KD loss\nthat leverages the autoregressive predictive coding (APC) mechanism to\nencourage the streaming model to predict unseen future contexts. Experimental\nresults show that the proposed method can significantly reduce the word error\nrate compared to previous token probability distillation methods.\n","authors":["Kyuhong Shim","Jinkyu Lee","Simyung Chang","Kyuwoong Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.16415v1.pdf","comment":"Accepted to Interspeech 2023"},{"id":"http://arxiv.org/abs/2306.15245v2","updated":"2023-08-31T02:50:25Z","published":"2023-06-27T06:58:03Z","title":"C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue\n Evaluation","summary":" Existing reference-free turn-level evaluation metrics for chatbots\ninadequately capture the interaction between the user and the system.\nConsequently, they often correlate poorly with human evaluations. To address\nthis issue, we propose a novel model-agnostic approach that leverages\nConditional Pointwise Mutual Information (C-PMI) to measure the turn-level\ninteraction between the system and the user based on a given evaluation\ndimension. Experimental results on the widely used FED dialogue evaluation\ndataset demonstrate that our approach significantly improves the correlation\nwith human judgment compared with existing evaluation systems. By replacing the\nnegative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve\na relative 60.5% higher Spearman correlation on average for the FED evaluation\nmetric. 
Our code is publicly available at https://github.com/renll/C-PMI.\n","authors":["Liliang Ren","Mankeerat Sidhu","Qi Zeng","Revanth Gangi Reddy","Heng Ji","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2306.15245v2.pdf","comment":"Published at ACL2023 DialDoc Workshop; Updated Results"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.16911v1","updated":"2023-08-31T17:59:46Z","published":"2023-08-31T17:59:46Z","title":"PointLLM: Empowering Large Language Models to Understand Point Clouds","summary":" The unprecedented advancements in Large Language Models (LLMs) have created a\nprofound impact on natural language processing but are yet to fully embrace the\nrealm of 3D understanding. This paper introduces PointLLM, a preliminary effort\nto fill this gap, thereby enabling LLMs to understand point clouds and offering\na new avenue beyond 2D visual data. PointLLM processes colored object point\nclouds with human instructions and generates contextually appropriate\nresponses, illustrating its grasp of point clouds and common sense.\nSpecifically, it leverages a point cloud encoder with a powerful LLM to\neffectively fuse geometric, appearance, and linguistic information. We collect\na novel dataset comprising 660K simple and 70K complex point-text instruction\npairs to enable a two-stage training strategy: initially aligning latent spaces\nand subsequently instruction-tuning the unified model. To rigorously evaluate\nour model's perceptual abilities and its generalization capabilities, we\nestablish two benchmarks: Generative 3D Object Classification and 3D Object\nCaptioning, assessed through three different methods, including human\nevaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment\nresults show that PointLLM demonstrates superior performance over existing 2D\nbaselines. Remarkably, in human-evaluated object captioning tasks, PointLLM\noutperforms human annotators in over 50% of the samples. Codes, datasets, and\nbenchmarks are available at https://github.com/OpenRobotLab/PointLLM .\n","authors":["Runsen Xu","Xiaolong Wang","Tai Wang","Yilun Chen","Jiangmiao Pang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.16911v1.pdf","comment":"19 pages. Empowering large language models with 3D point cloud\n understanding, accompanied by a novel dataset and carefully designed\n benchmarks. Project page: https://runsenxu.com/projects/PointLLM"},{"id":"http://arxiv.org/abs/2308.16909v1","updated":"2023-08-31T17:59:33Z","published":"2023-08-31T17:59:33Z","title":"StyleInV: A Temporal Style Modulated Inversion Network for Unconditional\n Video Generation","summary":" Unconditional video generation is a challenging task that involves\nsynthesizing high-quality videos that are both coherent and of extended\nduration. To address this challenge, researchers have used pretrained StyleGAN\nimage generators for high-quality frame synthesis and focused on motion\ngenerator design. The motion generator is trained in an autoregressive manner\nusing heavy 3D convolutional discriminators to ensure motion coherence during\nvideo generation. In this paper, we introduce a novel motion generator design\nthat uses a learning-based inversion network for GAN. The encoder in our method\ncaptures rich and smooth priors from encoding images to latents, and given the\nlatent of an initially generated frame as guidance, our method can generate\nsmooth future latent by modulating the inversion encoder temporally. 
Our method\nenjoys the advantage of sparse training and naturally constrains the generation\nspace of our motion generator with the inversion network guided by the initial\nframe, eliminating the need for heavy discriminators. Moreover, our method\nsupports style transfer with simple fine-tuning when the encoder is paired with\na pretrained StyleGAN generator. Extensive experiments conducted on various\nbenchmarks demonstrate the superiority of our method in generating long and\nhigh-resolution videos with decent single-frame quality and temporal\nconsistency.\n","authors":["Yuhan Wang","Liming Jiang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2308.16909v1.pdf","comment":"ICCV 2023. Code: https://github.com/johannwyh/StyleInV Project page:\n https://www.mmlab-ntu.com/project/styleinv/index.html"},{"id":"http://arxiv.org/abs/2308.16906v1","updated":"2023-08-31T17:59:24Z","published":"2023-08-31T17:59:24Z","title":"Fine-Grained Cross-View Geo-Localization Using a Correlation-Aware\n Homography Estimator","summary":" In this paper, we introduce a novel approach to fine-grained cross-view\ngeo-localization. Our method aligns a warped ground image with a corresponding\nGPS-tagged satellite image covering the same area using homography estimation.\nWe first employ a differentiable spherical transform, adhering to geometric\nprinciples, to accurately align the perspective of the ground image with the\nsatellite map. This transformation effectively places ground and aerial images\nin the same view and on the same plane, reducing the task to an image alignment\nproblem. To address challenges such as occlusion, small overlapping range, and\nseasonal variations, we propose a robust correlation-aware homography estimator\nto align similar parts of the transformed ground image with the satellite\nimage. Our method achieves sub-pixel resolution and meter-level GPS accuracy by\nmapping the center point of the transformed ground image to the satellite image\nusing a homography matrix and determining the orientation of the ground camera\nusing a point above the central axis. Operating at a speed of 30 FPS, our\nmethod outperforms state-of-the-art techniques, reducing the mean metric\nlocalization error by 21.3% and 32.4% in same-area and cross-area\ngeneralization tasks on the VIGOR benchmark, respectively, and by 34.4% on the\nKITTI benchmark in same-area evaluation.\n","authors":["Xiaolong Wang","Runsen Xu","Zuofan Cui","Zeyu Wan","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16906v1.pdf","comment":"19 pages. Reducing the cross-view geo-localization problem to a 2D\n image alignment problem by utilizing BEV transformation, and completing the\n alignment process with a correlation-aware homography estimator. Code:\n https://github.com/xlwangDev/HC-Net"},{"id":"http://arxiv.org/abs/2308.16905v1","updated":"2023-08-31T17:59:08Z","published":"2023-08-31T17:59:08Z","title":"InterDiff: Generating 3D Human-Object Interactions with Physics-Informed\n Diffusion","summary":" This paper addresses a novel task of anticipating 3D human-object\ninteractions (HOIs). Most existing research on HOI synthesis lacks\ncomprehensive whole-body interactions with dynamic objects, e.g., often limited\nto manipulating small or static objects. Our task is significantly more\nchallenging, as it requires modeling dynamic objects with various shapes,\ncapturing whole-body motion, and ensuring physically valid interactions. 
To\nthis end, we propose InterDiff, a framework comprising two key steps: (i)\ninteraction diffusion, where we leverage a diffusion model to encode the\ndistribution of future human-object interactions; (ii) interaction correction,\nwhere we introduce a physics-informed predictor to correct denoised HOIs in a\ndiffusion step. Our key insight is to inject prior knowledge that the\ninteractions under reference with respect to contact points follow a simple\npattern and are easily predictable. Experiments on multiple human-object\ninteraction datasets demonstrate the effectiveness of our method for this task,\ncapable of producing realistic, vivid, and remarkably long-term 3D HOI\npredictions.\n","authors":["Sirui Xu","Zhengyuan Li","Yu-Xiong Wang","Liang-Yan Gui"],"pdf_url":"https://arxiv.org/pdf/2308.16905v1.pdf","comment":"ICCV 2023; Project Page: https://sirui-xu.github.io/InterDiff/"},{"id":"http://arxiv.org/abs/2303.12059v3","updated":"2023-08-31T17:58:46Z","published":"2023-03-21T17:51:23Z","title":"Motion Matters: Neural Motion Transfer for Better Camera Physiological\n Measurement","summary":" Machine learning models for camera-based physiological measurement can have\nweak generalization due to a lack of representative training data. Body motion\nis one of the most significant sources of noise when attempting to recover the\nsubtle cardiac pulse from a video. We explore motion transfer as a form of data\naugmentation to introduce motion variation while preserving physiological\nchanges of interest. We adapt a neural video synthesis approach to augment\nvideos for the task of remote photoplethysmography (rPPG) and study the effects\nof motion augmentation with respect to 1) the magnitude and 2) the type of\nmotion. After training on motion-augmented versions of publicly available\ndatasets, we demonstrate a 47% improvement over existing inter-dataset results\nusing various state-of-the-art methods on the PURE dataset. We also present\ninter-dataset results on five benchmark datasets to show improvements of up to\n79% using TS-CAN, a neural rPPG estimation method. Our findings illustrate the\nusefulness of motion transfer as a data augmentation technique for improving\nthe generalization of models for camera-based physiological sensing. We release\nour code for using motion transfer as a data augmentation technique on three\npublicly available datasets, UBFC-rPPG, PURE, and SCAMPS, and models\npre-trained on motion-augmented data here: https://motion-matters.github.io/\n","authors":["Akshay Paruchuri","Xin Liu","Yulu Pan","Shwetak Patel","Daniel McDuff","Soumyadip Sengupta"],"pdf_url":"https://arxiv.org/pdf/2303.12059v3.pdf","comment":"17 pages, 6 figures, 15 tables"},{"id":"http://arxiv.org/abs/2308.16896v1","updated":"2023-08-31T17:57:17Z","published":"2023-08-31T17:57:17Z","title":"PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic\n Occupancy Prediction","summary":" Semantic segmentation in autonomous driving has been undergoing an evolution\nfrom sparse point segmentation to dense voxel segmentation, where the objective\nis to predict the semantic occupancy of each voxel in the concerned 3D space.\nThe dense nature of the prediction space has rendered existing efficient\n2D-projection-based methods (e.g., bird's eye view, range view, etc.)\nineffective, as they can only describe a subspace of the 3D scene. 
To address\nthis, we propose a cylindrical tri-perspective view to represent point clouds\neffectively and comprehensively and a PointOcc model to process them\nefficiently. Considering the distance distribution of LiDAR point clouds, we\nconstruct the tri-perspective view in the cylindrical coordinate system for\nmore fine-grained modeling of nearer areas. We employ spatial group pooling to\nmaintain structural details during projection and adopt 2D backbones to\nefficiently process each TPV plane. Finally, we obtain the features of each\npoint by aggregating its projected features on each of the processed TPV planes\nwithout the need for any post-processing. Extensive experiments on both 3D\noccupancy prediction and LiDAR segmentation benchmarks demonstrate that the\nproposed PointOcc achieves state-of-the-art performance with much faster speed.\nSpecifically, despite only using LiDAR, PointOcc significantly outperforms all\nother methods, including multi-modal methods, with a large margin on the\nOpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc.\n","authors":["Sicheng Zuo","Wenzhao Zheng","Yuanhui Huang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16896v1.pdf","comment":"Code is available at https://github.com/wzzheng/PointOcc"},{"id":"http://arxiv.org/abs/2308.16894v1","updated":"2023-08-31T17:56:19Z","published":"2023-08-31T17:56:19Z","title":"EMDB: The Electromagnetic Database of Global 3D Human Pose and Shape in\n the Wild","summary":" We present EMDB, the Electromagnetic Database of Global 3D Human Pose and\nShape in the Wild. EMDB is a novel dataset that contains high-quality 3D SMPL\npose and shape parameters with global body and camera trajectories for\nin-the-wild videos. We use body-worn, wireless electromagnetic (EM) sensors and\na hand-held iPhone to record a total of 58 minutes of motion data, distributed\nover 81 indoor and outdoor sequences and 10 participants. Together with\naccurate body poses and shapes, we also provide global camera poses and body\nroot trajectories. To construct EMDB, we propose a multi-stage optimization\nprocedure, which first fits SMPL to the 6-DoF EM measurements and then refines\nthe poses via image observations. To achieve high-quality results, we leverage\na neural implicit avatar model to reconstruct detailed human surface geometry\nand appearance, which allows for improved alignment and smoothness via a dense\npixel-level objective. Our evaluations, conducted with a multi-view volumetric\ncapture system, indicate that EMDB has an expected accuracy of 2.3 cm\npositional and 10.6 degrees angular error, surpassing the accuracy of previous\nin-the-wild datasets. We evaluate existing state-of-the-art monocular RGB\nmethods for camera-relative and global pose estimation on EMDB. EMDB is\npublicly available under https://ait.ethz.ch/emdb\n","authors":["Manuel Kaufmann","Jie Song","Chen Guo","Kaiyue Shen","Tianjian Jiang","Chengcheng Tang","Juan Zarate","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2308.16894v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16893v1","updated":"2023-08-31T17:56:13Z","published":"2023-08-31T17:56:13Z","title":"Language-Conditioned Path Planning","summary":" Contact is at the core of robotic manipulation. At times, it is desired (e.g.\nmanipulation and grasping), and at times, it is harmful (e.g. when avoiding\nobstacles). 
However, traditional path planning algorithms focus solely on\ncollision-free paths, limiting their applicability in contact-rich tasks. To\naddress this limitation, we propose the domain of Language-Conditioned Path\nPlanning, where contact-awareness is incorporated into the path planning\nproblem. As a first step in this domain, we propose Language-Conditioned\nCollision Functions (LACO), a novel approach that learns a collision function\nusing only a single-view image, language prompt, and robot configuration. LACO\npredicts collisions between the robot and the environment, enabling flexible,\nconditional path planning without the need for manual object annotations, point\ncloud data, or ground-truth object meshes. In both simulation and the real\nworld, we demonstrate that LACO can facilitate complex, nuanced path plans that\nallow for interaction with objects that are safe to collide with, rather than\nprohibiting any collision.\n","authors":["Amber Xie","Youngwoon Lee","Pieter Abbeel","Stephen James"],"pdf_url":"https://arxiv.org/pdf/2308.16893v1.pdf","comment":"Conference on Robot Learning, 2023"},{"id":"http://arxiv.org/abs/2308.16891v1","updated":"2023-08-31T17:52:10Z","published":"2023-08-31T17:52:10Z","title":"GNFactor: Multi-Task Real Robot Learning with Generalizable Neural\n Feature Fields","summary":" It is a long-standing problem in robotics to develop agents capable of\nexecuting diverse manipulation tasks from visual observations in unstructured\nreal-world environments. To achieve this goal, the robot needs to have a\ncomprehensive understanding of the 3D structure and semantics of the scene. In\nthis work, we present $\\textbf{GNFactor}$, a visual behavior cloning agent for\nmulti-task robotic manipulation with $\\textbf{G}$eneralizable $\\textbf{N}$eural\nfeature $\\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural\nfield (GNF) as a reconstruction module and a Perceiver Transformer as a\ndecision-making module, leveraging a shared deep 3D voxel representation. To\nincorporate semantics in 3D, the reconstruction module utilizes a\nvision-language foundation model ($\\textit{e.g.}$, Stable Diffusion) to distill\nrich semantic information into the deep 3D voxel. We evaluate GNFactor on 3\nreal robot tasks and perform detailed ablations on 10 RLBench tasks with a\nlimited number of demonstrations. We observe a substantial improvement of\nGNFactor over current state-of-the-art methods in seen and unseen tasks,\ndemonstrating the strong generalization ability of GNFactor. Our project\nwebsite is https://yanjieze.com/GNFactor/ .\n","authors":["Yanjie Ze","Ge Yan","Yueh-Hua Wu","Annabella Macaluso","Yuying Ge","Jianglong Ye","Nicklas Hansen","Li Erran Li","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16891v1.pdf","comment":"CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/"},{"id":"http://arxiv.org/abs/2308.16890v1","updated":"2023-08-31T17:52:04Z","published":"2023-08-31T17:52:04Z","title":"TouchStone: Evaluating Vision-Language Models by Language Models","summary":" Large vision-language models (LVLMs) have recently witnessed rapid\nadvancements, exhibiting a remarkable capacity for perceiving, understanding,\nand processing visual information by connecting the visual receptor with large\nlanguage models (LLMs). However, current assessments mainly focus on\nrecognizing and reasoning abilities, lacking direct evaluation of\nconversational skills and neglecting visual storytelling abilities. 
In this\npaper, we propose an evaluation method that uses strong LLMs as judges to\ncomprehensively evaluate the various abilities of LVLMs. Firstly, we construct\na comprehensive visual dialogue dataset TouchStone, consisting of open-world\nimages and questions, covering five major categories of abilities and 27\nsubtasks. This dataset not only covers fundamental recognition and\ncomprehension but also extends to literary creation. Secondly, by integrating\ndetailed image annotations we effectively transform the multimodal input\ncontent into a form understandable by LLMs. This enables us to employ advanced\nLLMs for directly evaluating the quality of the multimodal dialogue without\nrequiring human intervention. Through validation, we demonstrate that powerful\nLVLMs, such as GPT-4, can effectively score dialogue quality by leveraging\ntheir textual capabilities alone, aligning with human preferences. We hope our\nwork can serve as a touchstone for LVLMs' evaluation and pave the way for\nbuilding stronger LVLMs. The evaluation code is available at\nhttps://github.com/OFA-Sys/TouchStone.\n","authors":["Shuai Bai","Shusheng Yang","Jinze Bai","Peng Wang","Xingxuan Zhang","Junyang Lin","Xinggang Wang","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16890v1.pdf","comment":"https://github.com/OFA-Sys/TouchStone"},{"id":"http://arxiv.org/abs/2212.02611v2","updated":"2023-08-31T17:51:08Z","published":"2022-12-05T21:52:12Z","title":"StyleGAN as a Utility-Preserving Face De-identification Method","summary":" Face de-identification methods have been proposed to preserve users' privacy\nby obscuring their faces. These methods, however, can degrade the quality of\nphotos, and they usually do not preserve the utility of faces, i.e., their age,\ngender, pose, and facial expression. Recently, GANs, such as StyleGAN, have\nbeen proposed, which generate realistic, high-quality imaginary faces. In this\npaper, we investigate the use of StyleGAN in generating de-identified faces\nthrough style mixing. We examined this de-identification method for preserving\nutility and privacy by implementing several face detection, verification, and\nidentification attacks and conducting a user study. The results from our\nextensive experiments, human evaluation, and comparison with two\nstate-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN\nperforms on par or better than these methods, preserving users' privacy and\nimages' utility. In particular, the results of the machine learning-based\nexperiments show that StyleGAN0-4 preserves utility better than CIAGAN and\nDeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves\nutility at the same level while providing more privacy. In this paper, for the\nfirst time, we also performed a carefully designed user study to examine both\nprivacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well\nas CIAGAN and DeepPrivacy from the human observers' perspectives. Our\nstatistical tests showed that participants tend to verify and identify\nStyleGAN0-5 images more easily than DeepPrivacy images. All the methods but\nStyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding\nutility, as expected, StyleGAN0-5 performed significantly better in preserving\nsome attributes. 
Among all methods, on average, participants believe gender has\nbeen preserved the most while naturalness has been preserved the least.\n","authors":["Seyyed Mohammad Sadegh Moosavi Khorzooghi","Shirin Nilizadeh"],"pdf_url":"https://arxiv.org/pdf/2212.02611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16880v1","updated":"2023-08-31T17:37:23Z","published":"2023-08-31T17:37:23Z","title":"Text2Scene: Text-driven Indoor Scene Stylization with Part-aware Details","summary":" We propose Text2Scene, a method to automatically create realistic textures\nfor virtual scenes composed of multiple objects. Guided by a reference image\nand text descriptions, our pipeline adds detailed texture on labeled 3D\ngeometries in the room such that the generated colors respect the hierarchical\nstructure or semantic parts that are often composed of similar materials.\nInstead of applying flat stylization on the entire scene at a single step, we\nobtain weak semantic cues from geometric segmentation, which are further\nclarified by assigning initial colors to segmented parts. Then we add texture\ndetails for individual objects such that their projections on image space\nexhibit feature embedding aligned with the embedding of the input. The\ndecomposition makes the entire pipeline tractable to a moderate amount of\ncomputation resources and memory. As our framework utilizes the existing\nresources of image and text embedding, it does not require dedicated datasets\nwith high-quality textures designed by skillful artists. To the best of our\nknowledge, it is the first practical and scalable approach that can create\ndetailed and realistic textures of the desired style that maintain structural\ncontext for scenes with multiple objects.\n","authors":["Inwoo Hwang","Hyeonwoo Kim","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2308.16880v1.pdf","comment":"Accepted to CVPR 2023"},{"id":"http://arxiv.org/abs/2308.16876v1","updated":"2023-08-31T17:23:50Z","published":"2023-08-31T17:23:50Z","title":"SportsSloMo: A New Benchmark and Baselines for Human-centric Video Frame\n Interpolation","summary":" Human-centric video frame interpolation has great potential for improving\npeople's entertainment experiences and finding commercial applications in the\nsports analysis industry, e.g., synthesizing slow-motion videos. Although there\nare multiple benchmark datasets available in the community, none of them is\ndedicated for human-centric scenarios. To bridge this gap, we introduce\nSportsSloMo, a benchmark consisting of more than 130K video clips and 1M video\nframes of high-resolution ($\\geq$720p) slow-motion sports videos crawled from\nYouTube. We re-train several state-of-the-art methods on our benchmark, and the\nresults show a decrease in their accuracy compared to other datasets. It\nhighlights the difficulty of our benchmark and suggests that it poses\nsignificant challenges even for the best-performing methods, as human bodies\nare highly deformable and occlusions are frequent in sports videos. To improve\nthe accuracy, we introduce two loss terms considering the human-aware priors,\nwhere we add auxiliary supervision to panoptic segmentation and human keypoints\ndetection, respectively. The loss terms are model agnostic and can be easily\nplugged into any video frame interpolation approaches. Experimental results\nvalidate the effectiveness of our proposed loss terms, leading to consistent\nperformance improvement over 5 existing models, which establish strong baseline\nmodels on our benchmark. 
The dataset and code can be found at:\nhttps://neu-vi.github.io/SportsSlomo/.\n","authors":["Jiaben Chen","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.16876v1.pdf","comment":"Project Page: https://neu-vi.github.io/SportsSlomo/"},{"id":"http://arxiv.org/abs/2308.16875v1","updated":"2023-08-31T17:22:18Z","published":"2023-08-31T17:22:18Z","title":"Holistic Processing of Colour Images Using Novel Quaternion-Valued\n Wavelets on the Plane","summary":" We investigate the applicability of quaternion-valued wavelets on the plane\nto holistic colour image processing. We present a methodology for decomposing\nand reconstructing colour images using quaternionic wavelet filters associated\nto recently developed quaternion-valued wavelets on the plane. We consider\ncompression, enhancement, segmentation, and denoising techniques to demonstrate\nquaternion-valued wavelets as a promising tool for holistic colour image\nprocessing.\n","authors":["Neil D. Dizon","Jeffrey A. Hogan"],"pdf_url":"https://arxiv.org/pdf/2308.16875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16863v1","updated":"2023-08-31T17:05:14Z","published":"2023-08-31T17:05:14Z","title":"Self-pruning Graph Neural Network for Predicting Inflammatory Disease\n Activity in Multiple Sclerosis from Brain MR Images","summary":" Multiple Sclerosis (MS) is a severe neurological disease characterized by\ninflammatory lesions in the central nervous system. Hence, predicting\ninflammatory disease activity is crucial for disease assessment and treatment.\nHowever, MS lesions can occur throughout the brain and vary in shape, size and\ntotal count among patients. The high variance in lesion load and locations\nmakes it challenging for machine learning methods to learn a globally effective\nrepresentation of whole-brain MRI scans to assess and predict disease.\nTechnically it is non-trivial to incorporate essential biomarkers such as\nlesion load or spatial proximity. Our work represents the first attempt to\nutilize graph neural networks (GNN) to aggregate these biomarkers for a novel\nglobal representation. We propose a two-stage MS inflammatory disease activity\nprediction approach. First, a 3D segmentation network detects lesions, and a\nself-supervised algorithm extracts their image features. Second, the detected\nlesions are used to build a patient graph. The lesions act as nodes in the\ngraph and are initialized with image features extracted in the first stage.\nFinally, the lesions are connected based on their spatial proximity and the\ninflammatory disease activity prediction is formulated as a graph\nclassification task. Furthermore, we propose a self-pruning strategy to\nauto-select the most critical lesions for prediction. Our proposed method\noutperforms the existing baseline by a large margin (AUCs of 0.67 vs. 0.61 and\n0.66 vs. 0.60 for one-year and two-year inflammatory disease activity,\nrespectively). Finally, our proposed method enjoys inherent explainability by\nassigning an importance score to each lesion for the overall prediction. Code\nis available at https://github.com/chinmay5/ms_ida.git\n","authors":["Chinmay Prabhakar","Hongwei Bran Li","Johannes C. 
Paetzold","Timo Loehr","Chen Niu","Mark Mühlau","Daniel Rueckert","Benedikt Wiestler","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2308.16863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.20091v3","updated":"2023-08-31T16:45:40Z","published":"2023-05-31T17:59:52Z","title":"Humans in 4D: Reconstructing and Tracking Humans with Transformers","summary":" We present an approach to reconstruct humans and track them over time. At the\ncore of our approach, we propose a fully \"transformerized\" version of a network\nfor human mesh recovery. This network, HMR 2.0, advances the state of the art\nand shows the capability to analyze unusual poses that have in the past been\ndifficult to reconstruct from single images. To analyze video, we use 3D\nreconstructions from HMR 2.0 as input to a tracking system that operates in 3D.\nThis enables us to deal with multiple people and maintain identities through\nocclusion events. Our complete approach, 4DHumans, achieves state-of-the-art\nresults for tracking people from monocular video. Furthermore, we demonstrate\nthe effectiveness of HMR 2.0 on the downstream task of action recognition,\nachieving significant improvements over previous pose-based action recognition\napproaches. Our code and models are available on the project website:\nhttps://shubham-goel.github.io/4dhumans/.\n","authors":["Shubham Goel","Georgios Pavlakos","Jathushan Rajasegaran","Angjoo Kanazawa","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2305.20091v3.pdf","comment":"In ICCV 2023. Project Webpage:\n https://shubham-goel.github.io/4dhumans/"},{"id":"http://arxiv.org/abs/2301.00752v3","updated":"2023-08-31T16:28:50Z","published":"2023-01-02T16:51:40Z","title":"Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave\n Communications","summary":" This study demonstrates the feasibility of point cloud-based proactive link\nquality prediction for millimeter-wave (mmWave) communications. Previous\nstudies have proposed machine learning-based methods to predict received signal\nstrength for future time periods using time series of depth images to mitigate\nthe line-of-sight (LOS) path blockage by pedestrians in mmWave communication.\nHowever, these image-based methods have limited applicability due to privacy\nconcerns as camera images may contain sensitive information. This study\nproposes a point cloud-based method for mmWave link quality prediction and\ndemonstrates its feasibility through experiments. Point clouds represent\nthree-dimensional (3D) spaces as a set of points and are sparser and less\nlikely to contain sensitive information than camera images. Additionally, point\nclouds provide 3D position and motion information, which is necessary for\nunderstanding the radio propagation environment involving pedestrians. This\nstudy designs the mmWave link quality prediction method and conducts realistic\nindoor experiments, where the link quality fluctuates significantly due to\nhuman blockage, using commercially available IEEE 802.11ad-based 60 GHz\nwireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light\ndetection and ranging (LiDAR) for point cloud acquisition. The experimental\nresults showed that our proposed method can predict future large attenuation of\nmmWave received signal strength and throughput induced by the LOS path blockage\nby pedestrians with comparable or superior accuracy to image-based prediction\nmethods. 
Hence, our point cloud-based method can serve as a viable alternative\nto image-based methods.\n","authors":["Shoki Ohta","Takayuki Nishio","Riichi Kudo","Kahoko Takahashi","Hisashi Nagata"],"pdf_url":"https://arxiv.org/pdf/2301.00752v3.pdf","comment":"Submitted to IEEE Transactions on Machine Learning in Communications\n and Networking"},{"id":"http://arxiv.org/abs/2308.16847v1","updated":"2023-08-31T16:26:17Z","published":"2023-08-31T16:26:17Z","title":"Diffusion Models for Interferometric Satellite Aperture Radar","summary":" Probabilistic Diffusion Models (PDMs) have recently emerged as a very\npromising class of generative models, achieving high performance in natural\nimage generation. However, their performance relative to non-natural images,\nlike radar-based satellite data, remains largely unknown. Generating large\namounts of synthetic (and especially labelled) satellite data is crucial to\nimplement deep-learning approaches for the processing and analysis of\n(interferometric) satellite aperture radar data. Here, we leverage PDMs to\ngenerate several radar-based satellite image datasets. We show that PDMs\nsucceed in generating images with complex and realistic structures, but that\nsampling time remains an issue. Indeed, accelerated sampling strategies, which\nwork well on simple image datasets like MNIST, fail on our radar datasets. We\nprovide a simple and versatile open-source codebase,\nhttps://github.com/thomaskerdreux/PDM_SAR_InSAR_generation, to train, sample and\nevaluate PDMs using any dataset on a single GPU.\n","authors":["Alexandre Tuel","Thomas Kerdreux","Claudia Hulbert","Bertrand Rouet-Leduc"],"pdf_url":"https://arxiv.org/pdf/2308.16847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16825v1","updated":"2023-08-31T15:56:29Z","published":"2023-08-31T15:56:29Z","title":"Coarse-to-Fine Amodal Segmentation with Shape Prior","summary":" Amodal object segmentation is a challenging task that involves segmenting\nboth visible and occluded parts of an object. In this paper, we propose a novel\napproach, called Coarse-to-Fine Segmentation (C2F-Seg), that addresses this\nproblem by progressively modeling the amodal segmentation. C2F-Seg initially\nreduces the learning space from the pixel-level image space to the\nvector-quantized latent space. This enables us to better handle long-range\ndependencies and learn a coarse-grained amodal segment from visual features and\nvisible segments. However, this latent space lacks detailed information about\nthe object, which makes it difficult to provide a precise segmentation\ndirectly. To address this issue, we propose a convolution refine module to\ninject fine-grained information and provide a more precise amodal object\nsegmentation based on visual features and coarse-predicted segmentation. To\nhelp the studies of amodal object segmentation, we create a synthetic amodal\ndataset, named MOViD-Amodal (MOViD-A), which can be used for both image and\nvideo amodal object segmentation. We extensively evaluate our model on two\nbenchmark datasets: KINS and COCO-A. Our empirical results demonstrate the\nsuperiority of C2F-Seg. 
Moreover, we exhibit the potential of our approach for\nvideo amodal object segmentation tasks on FISHBOWL and our proposed MOViD-A.\nProject page at: http://jianxgao.github.io/C2F-Seg.\n","authors":["Jianxiong Gao","Xuelin Qian","Yikai Wang","Tianjun Xiao","Tong He","Zheng Zhang","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16825v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16819v1","updated":"2023-08-31T15:49:53Z","published":"2023-08-31T15:49:53Z","title":"BTSeg: Barlow Twins Regularization for Domain Adaptation in Semantic\n Segmentation","summary":" Semantic image segmentation is a critical component in many computer vision\nsystems, such as autonomous driving. In such applications, adverse conditions\n(heavy rain, night time, snow, extreme lighting) on the one hand pose specific\nchallenges, yet are typically underrepresented in the available datasets.\nGenerating more training data is cumbersome and expensive, and the process\nitself is error-prone due to the inherent aleatoric uncertainty. To address\nthis challenging problem, we propose BTSeg, which exploits image-level\ncorrespondences as weak supervision signal to learn a segmentation model that\nis agnostic to adverse conditions. To this end, our approach uses the Barlow\ntwins loss from the field of unsupervised learning and treats images taken at\nthe same location but under different adverse conditions as \"augmentations\" of\nthe same unknown underlying base image. This allows the training of a\nsegmentation model that is robust to appearance changes introduced by different\nadverse conditions. We evaluate our approach on ACDC and the new challenging\nACG benchmark to demonstrate its robustness and generalization capabilities.\nOur approach performs favorably when compared to the current state-of-the-art\nmethods, while also being simpler to implement and train. The code will be\nreleased upon acceptance.\n","authors":["Johannes Künzel","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2308.16819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15975v2","updated":"2023-08-31T15:29:44Z","published":"2023-08-30T11:57:04Z","title":"RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation","summary":" For robots to be useful outside labs and specialized factories we need a way\nto teach them new useful behaviors quickly. Current approaches lack either the\ngenerality to onboard new tasks without task-specific engineering, or else lack\nthe data-efficiency to do so in an amount of time that enables practical use.\nIn this work we explore dense tracking as a representational vehicle to allow\nfaster and more general learning from demonstration. Our approach utilizes\nTrack-Any-Point (TAP) models to isolate the relevant motion in a demonstration,\nand parameterize a low-level controller to reproduce this motion across changes\nin the scene configuration. 
We show this results in robust robot policies that\ncan solve complex object-arrangement tasks such as shape-matching, stacking,\nand even full path-following tasks such as applying glue and sticking objects\ntogether, all from demonstrations that can be collected in minutes.\n","authors":["Mel Vecerik","Carl Doersch","Yi Yang","Todor Davchev","Yusuf Aytar","Guangyao Zhou","Raia Hadsell","Lourdes Agapito","Jon Scholz"],"pdf_url":"https://arxiv.org/pdf/2308.15975v2.pdf","comment":"Project website: https://robotap.github.io"},{"id":"http://arxiv.org/abs/2308.16801v1","updated":"2023-08-31T15:23:33Z","published":"2023-08-31T15:23:33Z","title":"Multiscale Residual Learning of Graph Convolutional Sequence Chunks for\n Human Motion Prediction","summary":" A new method is proposed for human motion prediction by learning temporal and\nspatial dependencies. Recently, multiscale graphs have been developed to model\nthe human body at higher abstraction levels, resulting in more stable motion\nprediction. Current methods however predetermine scale levels and combine\nspatially proximal joints to generate coarser scales based on human priors,\neven though movement patterns in different motion sequences vary and do not\nfully comply with a fixed graph of spatially connected joints. Another problem\nwith graph convolutional methods is mode collapse, in which predicted poses\nconverge around a mean pose with no discernible movements, particularly in\nlong-term predictions. To tackle these issues, we propose ResChunk, an\nend-to-end network which explores dynamically correlated body components based\non the pairwise relationships between all joints in individual sequences.\nResChunk is trained to learn the residuals between target sequence chunks in an\nautoregressive manner to enforce the temporal connectivities between\nconsecutive chunks. It is hence a sequence-to-sequence prediction network which\nconsiders dynamic spatio-temporal features of sequences at multiple levels. Our\nexperiments on two challenging benchmark datasets, CMU Mocap and Human3.6M,\ndemonstrate that our proposed method is able to effectively model the sequence\ninformation for motion prediction and outperform other techniques to set a new\nstate-of-the-art. Our code is available at\nhttps://github.com/MohsenZand/ResChunk.\n","authors":["Mohsen Zand","Ali Etemad","Michael Greenspan"],"pdf_url":"https://arxiv.org/pdf/2308.16801v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2308.16777v1","updated":"2023-08-31T14:55:30Z","published":"2023-08-31T14:55:30Z","title":"Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models","summary":" Zero-shot referring image segmentation is a challenging task because it aims\nto find an instance segmentation mask based on the given referring\ndescriptions, without training on this type of paired data. Current zero-shot\nmethods mainly focus on using pre-trained discriminative models (e.g., CLIP).\nHowever, we have observed that generative models (e.g., Stable Diffusion) have\npotentially understood the relationships between various visual elements and\ntext descriptions, which are rarely investigated in this task. In this work, we\nintroduce a novel Referring Diffusional segmentor (Ref-Diff) for this task,\nwhich leverages the fine-grained multi-modal information from generative\nmodels. We demonstrate that without a proposal generator, a generative model\nalone can achieve comparable performance to existing SOTA weakly-supervised\nmodels. 
When we combine both generative and discriminative models, our Ref-Diff\noutperforms these competing methods by a significant margin. This indicates\nthat generative models are also beneficial for this task and can complement\ndiscriminative models for better referring segmentation. Our code is publicly\navailable at https://github.com/kodenii/Ref-Diff.\n","authors":["Minheng Ni","Yabo Zhang","Kailai Feng","Xiaoming Li","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.16777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16758v1","updated":"2023-08-31T14:26:33Z","published":"2023-08-31T14:26:33Z","title":"Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation\n Using only Images","summary":" Generating 3D faces from textual descriptions has a multitude of\napplications, such as gaming, movies, and robotics. Recent progress has\ndemonstrated the success of unconditional 3D face generation and text-to-3D\nshape generation. However, due to the limited text-3D face data pairs,\ntext-driven 3D face generation remains an open problem. In this paper, we\npropose a text-guided 3D face generation method, referred to as TG-3DFace, for\ngenerating realistic 3D faces using text guidance. Specifically, we adopt an\nunconditional 3D face generation framework and equip it with text conditions,\nwhich learns the text-guided 3D face generation with only text-2D face data. On\ntop of that, we propose two text-to-face cross-modal alignment techniques,\nincluding the global contrastive learning and the fine-grained alignment\nmodule, to facilitate high semantic consistency between generated 3D faces and\ninput texts. Besides, we present directional classifier guidance during the\ninference process, which encourages creativity for out-of-domain generations.\nCompared to the existing methods, TG-3DFace creates more realistic and\naesthetically pleasing 3D faces, boosting 9% multi-view consistency (MVIC) over\nLatent3D. The rendered face images generated by TG-3DFace achieve higher FID\nand CLIP score than text-to-2D face/image generation models, demonstrating our\nsuperiority in generating realistic and semantic-consistent textures.\n","authors":["Cuican Yu","Guansong Lu","Yihan Zeng","Jian Sun","Xiaodan Liang","Huibin Li","Zongben Xu","Songcen Xu","Wei Zhang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.16758v1.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.13241v4","updated":"2023-08-31T14:15:53Z","published":"2023-03-23T13:18:05Z","title":"6D Object Pose Estimation from Approximate 3D Models for Orbital\n Robotics","summary":" We present a novel technique to estimate the 6D pose of objects from single\nimages where the 3D geometry of the object is only given approximately and not\nas a precise 3D model. To achieve this, we employ a dense 2D-to-3D\ncorrespondence predictor that regresses 3D model coordinates for every pixel.\nIn addition to the 3D coordinates, our model also estimates the pixel-wise\ncoordinate error to discard correspondences that are likely wrong. This allows\nus to generate multiple 6D pose hypotheses of the object, which we then refine\niteratively using a highly efficient region-based approach. We also introduce a\nnovel pixel-wise posterior formulation by which we can estimate the probability\nfor each hypothesis and select the most likely one. As we show in experiments,\nour approach is capable of dealing with extreme visual conditions including\noverexposure, high contrast, or low signal-to-noise ratio. 
This makes it a\npowerful technique for the particularly challenging task of estimating the pose\nof tumbling satellites for in-orbit robotic applications. Our method achieves\nstate-of-the-art performance on the SPEED+ dataset and has won the SPEC2021\npost-mortem competition.\n","authors":["Maximilian Ulmer","Maximilian Durner","Martin Sundermeyer","Manuel Stoiber","Rudolph Triebel"],"pdf_url":"https://arxiv.org/pdf/2303.13241v4.pdf","comment":"Proceedings of IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS)"},{"id":"http://arxiv.org/abs/2304.05821v2","updated":"2023-08-31T14:15:51Z","published":"2023-04-12T12:59:02Z","title":"DUFormer: Solving Power Line Detection Task in Aerial Images using\n Semantic Segmentation","summary":" Unmanned aerial vehicles (UAVs) are frequently used for inspecting power\nlines and capturing high-resolution aerial images. However, detecting power\nlines in aerial images is difficult, as the foreground data (i.e., power lines) is\nsmall and the background information is abundant. To tackle this problem, we\nintroduce DUFormer, a semantic segmentation algorithm explicitly designed to\ndetect power lines in aerial images. We presuppose that it is advantageous to\ntrain an efficient Transformer model with sufficient feature extraction using a\nconvolutional neural network (CNN) with a strong inductive bias. With this goal\nin mind, we introduce a heavy token encoder that performs overlapping feature\nremodeling and tokenization. The encoder comprises a pyramid CNN feature\nextraction module and a power line feature enhancement module. After successful\nlocal feature extraction for power lines, feature fusion is conducted. Then, the\nTransformer block is used for global modeling. The final segmentation result is\nachieved by amalgamating local and global features in the decode head. Moreover,\nwe demonstrate the importance of the joint multi-weight loss function in power\nline segmentation. Our experimental results show that our proposed method\noutperforms all state-of-the-art methods in power line segmentation on the\npublicly accessible TTPLA dataset.\n","authors":["Deyu An","Qiang Zhang","Jianshu Chao","Ting Li","Feng Qiao","Yong Deng","Zhenpeng Bian"],"pdf_url":"https://arxiv.org/pdf/2304.05821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16742v1","updated":"2023-08-31T14:00:47Z","published":"2023-08-31T14:00:47Z","title":"Unsupervised CT Metal Artifact Reduction by Plugging Diffusion Priors in\n Dual Domains","summary":" During the process of computed tomography (CT), metallic implants often cause\ndisruptive artifacts in the reconstructed images, impeding accurate diagnosis.\nSeveral supervised deep learning-based approaches have been proposed for\nreducing metal artifacts (MAR). However, these methods heavily rely on training\nwith simulated data, as obtaining paired metal artifact CT and clean CT data in\nclinical settings is challenging. This limitation can lead to decreased\nperformance when applying these methods in clinical practice. Existing\nunsupervised MAR methods, whether based on learning or not, typically operate\nwithin a single domain, either in the image domain or the sinogram domain. In\nthis paper, we propose an unsupervised MAR method based on the diffusion model,\na generative model with a high capacity to represent data distributions.\nSpecifically, we first train a diffusion model using CT images without metal\nartifacts. 
Subsequently, we iteratively utilize the priors embedded within the\npre-trained diffusion model in both the sinogram and image domains to restore\nthe degraded portions caused by metal artifacts. This dual-domain processing\nempowers our approach to outperform existing unsupervised MAR methods,\nincluding another MAR method based on the diffusion model, which we have\nqualitatively and quantitatively validated using synthetic datasets. Moreover,\nour method demonstrates superior visual results compared to both supervised and\nunsupervised methods on clinical datasets.\n","authors":["Xuan Liu","Yaoqin Xie","Songhui Diao","Shan Tan","Xiaokun Liang"],"pdf_url":"https://arxiv.org/pdf/2308.16742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16741v1","updated":"2023-08-31T13:59:35Z","published":"2023-08-31T13:59:35Z","title":"Socratis: Are large multimodal models emotionally aware?","summary":" Existing emotion prediction benchmarks contain coarse emotion labels which do\nnot consider the diversity of emotions that an image and text can elicit in\nhumans due to various reasons. Learning diverse reactions to multimodal content\nis important as intelligent machines take a central role in generating and\ndelivering content to society. To address this gap, we propose Socratis, a\n\\underline{soc}ietal \\underline{r}e\\underline{a}c\\underline{ti}on\\underline{s}\nbenchmark, where each image-caption (IC) pair is annotated with multiple\nemotions and the reasons for feeling them. Socratis contains 18K free-form\nreactions for 980 emotions on 2075 image-caption pairs from 5 widely-read news\nand image-caption (IC) datasets. We benchmark the capability of\nstate-of-the-art multimodal large language models to generate the reasons for\nfeeling an emotion given an IC pair. Based on a preliminary human study, we\nobserve that humans prefer human-written reasons over 2 times more often than\nmachine-generated ones. This shows our task is harder than standard generation\ntasks because it starkly contrasts recent findings where humans cannot tell\napart machine vs human-written news articles, for instance. We further see that\ncurrent captioning metrics based on large vision-language models also fail to\ncorrelate with human preferences. We hope that these findings and our benchmark\nwill inspire further research on training emotionally aware models.\n","authors":["Katherine Deng","Arijit Ray","Reuben Tan","Saadia Gabriel","Bryan A. Plummer","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2308.16741v1.pdf","comment":"ICCV 2023 WECIA"},{"id":"http://arxiv.org/abs/2308.16739v1","updated":"2023-08-31T13:57:38Z","published":"2023-08-31T13:57:38Z","title":"Parsing is All You Need for Accurate Gait Recognition in the Wild","summary":" Binary silhouettes and keypoint-based skeletons have dominated human gait\nrecognition studies for decades since they are easy to extract from video\nframes. Despite their success in gait recognition for in-the-lab environments,\nthey usually fail in real-world scenarios due to their low information entropy\nfor gait representations. To achieve accurate gait recognition in the wild,\nthis paper presents a novel gait representation, named Gait Parsing Sequence\n(GPS). GPSs are sequences of fine-grained human segmentation, i.e., human\nparsing, extracted from video frames, so they have much higher information\nentropy to encode the shapes and dynamics of fine-grained human parts during\nwalking. 
Moreover, to effectively explore the capability of the GPS\nrepresentation, we propose a novel human parsing-based gait recognition\nframework, named ParsingGait. ParsingGait contains a Convolutional Neural\nNetwork (CNN)-based backbone and two light-weighted heads. The first head\nextracts global semantic features from GPSs, while the other one learns mutual\ninformation of part-level features through Graph Convolutional Networks to\nmodel the detailed dynamics of human walking. Furthermore, due to the lack of\nsuitable datasets, we build the first parsing-based dataset for gait\nrecognition in the wild, named Gait3D-Parsing, by extending the large-scale and\nchallenging Gait3D dataset. Based on Gait3D-Parsing, we comprehensively\nevaluate our method and existing gait recognition methods. The experimental\nresults show a significant improvement in accuracy brought by the GPS\nrepresentation and the superiority of ParsingGait. The code and dataset are\navailable at https://gait3d.github.io/gait3d-parsing-hp .\n","authors":["Jinkai Zheng","Xinchen Liu","Shuai Wang","Lihao Wang","Chenggang Yan","Wu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16739v1.pdf","comment":"16 pages, 14 figures, ACM MM 2023 accepted, project page:\n https://gait3d.github.io/gait3d-parsing-hp"},{"id":"http://arxiv.org/abs/2308.16738v1","updated":"2023-08-31T13:54:57Z","published":"2023-08-31T13:54:57Z","title":"US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for\n Cervical Lymph Node Lesions Diagnoses in Ultrasound Images","summary":" Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph\nnode lesions. However, the diagnoses of these images largely hinge on the\nexpertise of medical practitioners, rendering the process susceptible to\nmisdiagnoses. Although rapidly developing deep learning has substantially\nimproved the diagnoses of diverse ultrasound images, there remains a\nconspicuous research gap concerning cervical lymph nodes. The objective of our\nwork is to accurately diagnose cervical lymph node lesions by leveraging a deep\nlearning model. To this end, we first collected 3392 images containing normal\nlymph nodes, benign lymph node lesions, malignant primary lymph node lesions,\nand malignant metastatic lymph node lesions. Given that ultrasound images are\ngenerated by the reflection and scattering of sound waves across varied bodily\ntissues, we proposed the Conv-FFT Block. It integrates convolutional operations\nwith the fast Fourier transform to more astutely model the images. Building\nupon this foundation, we designed a novel architecture, named US-SFNet. This\narchitecture not only discerns variances in ultrasound images from the spatial\ndomain but also adeptly captures microstructural alterations across various\nlesions in the frequency domain. To ascertain the potential of US-SFNet, we\nbenchmarked it against 12 popular architectures through five-fold\ncross-validation. 
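The Conv-FFT Block above is described only at a high level; one plausible realization mixes a spatial convolution with a branch that filters in the frequency domain (layer sizes and the fusion scheme are our guesses, not the published design):

```python
# Sketch of a conv + FFT mixing block in PyTorch.
import torch
import torch.nn as nn

class ConvFFTBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.spatial = nn.Conv2d(channels, channels, 3, padding=1)
        self.freq = nn.Conv2d(2 * channels, 2 * channels, 1)  # on real/imag
        self.fuse = nn.Conv2d(2 * channels, channels, 1)

    def forward(self, x):                       # x: (B, C, H, W)
        s = self.spatial(x)
        f = torch.fft.rfft2(x, norm="ortho")    # complex (B, C, H, W//2+1)
        f = self.freq(torch.cat([f.real, f.imag], dim=1))
        re, im = f.chunk(2, dim=1)
        f = torch.fft.irfft2(torch.complex(re, im), s=x.shape[-2:], norm="ortho")
        return self.fuse(torch.cat([s, f], dim=1))
```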
The results show that US-SFNet achieves state-of-the-art performance, with 92.89%\naccuracy, 90.46% precision, 89.95% sensitivity, and 97.49% specificity.\n","authors":["Yubiao Yue","Jun Xue","Haihua Liang","Bingchun Luo","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.16738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16735v1","updated":"2023-08-31T13:52:28Z","published":"2023-08-31T13:52:28Z","title":"Post-Deployment Adaptation with Access to Source Data via Federated\n Learning and Source-Target Remote Gradient Alignment","summary":" Deployment of Deep Neural Networks in medical imaging is hindered by\ndistribution shift between training data and data processed after deployment,\ncausing performance degradation. Post-Deployment Adaptation (PDA) addresses\nthis by tailoring a pre-trained, deployed model to the target data distribution\nusing limited labelled or entirely unlabelled target data, while assuming no\naccess to source training data as they cannot be deployed with the model due to\nprivacy concerns and their large size. This makes reliable adaptation\nchallenging due to limited learning signal. This paper challenges this\nassumption and introduces FedPDA, a novel adaptation framework that brings the\nutility of learning from remote data from Federated Learning into PDA. FedPDA\nenables a deployed model to obtain information from source data via remote\ngradient exchange, while aiming to optimize the model specifically for the\ntarget domain. Tailored for FedPDA, we introduce a novel optimization method\nStarAlign (Source-Target Remote Gradient Alignment) that aligns gradients\nbetween source-target domain pairs by maximizing their inner product, to\nfacilitate learning a target-specific model. We demonstrate the method's\neffectiveness using multi-center databases for the tasks of cancer metastases\ndetection and skin lesion classification, where our method compares favourably\nto previous work. Code is available at: https://github.com/FelixWag/StarAlign\n","authors":["Felix Wagner","Zeju Li","Pramit Saha","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2308.16735v1.pdf","comment":"This version was accepted for the Machine Learning in Medical Imaging\n (MLMI 2023) workshop at MICCAI 2023"},{"id":"http://arxiv.org/abs/2306.17595v2","updated":"2023-08-31T13:45:28Z","published":"2023-06-30T12:14:13Z","title":"RBSR: Efficient and Flexible Recurrent Network for Burst\n Super-Resolution","summary":" Burst super-resolution (BurstSR) aims at reconstructing a high-resolution\n(HR) image from a sequence of low-resolution (LR) and noisy images, which is\nconducive to enhancing the imaging effects of smartphones with limited sensors.\nThe main challenge of BurstSR is to effectively combine the complementary\ninformation from input frames, while existing methods still struggle with it.\nIn this paper, we suggest fusing cues frame-by-frame with an efficient and\nflexible recurrent network. In particular, we emphasize the role of the\nbase-frame and utilize it as a key prompt to guide the knowledge acquisition\nfrom other frames in every recurrence. Moreover, we introduce an implicit\nweighting loss to improve the model's flexibility in facing input frames with\nvariable numbers. Extensive experiments on both synthetic and real-world\ndatasets demonstrate that our method achieves better results than\nstate-of-the-art ones.
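Returning to the StarAlign summary above, its stated core, maximizing the inner product of source and target gradients, can be rendered as a toy combined objective (our own simplification with illustrative names; the published algorithm exchanges gradients remotely and differs in detail):

```python
# Toy gradient-alignment step: penalize misaligned source/target gradients.
import torch

def aligned_step(model, loss_tgt, loss_src, opt, lam=0.1):
    g_t = torch.autograd.grad(loss_tgt, model.parameters(), create_graph=True)
    g_s = torch.autograd.grad(loss_src, model.parameters(), create_graph=True)
    inner = sum((a * b).sum() for a, b in zip(g_t, g_s))
    total = loss_tgt + loss_src - lam * inner   # reward aligned gradients
    opt.zero_grad()
    total.backward()
    opt.step()
```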
Codes and pre-trained models are available at\nhttps://github.com/ZcsrenlongZ/RBSR.\n","authors":["Renlong Wu","Zhilu Zhang","Shuohao Zhang","Hongzhi Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2306.17595v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.16725v1","updated":"2023-08-31T13:41:34Z","published":"2023-08-31T13:41:34Z","title":"Terrain Diffusion Network: Climatic-Aware Terrain Generation with\n Geological Sketch Guidance","summary":" Sketch-based terrain generation seeks to create realistic landscapes for\nvirtual environments in various applications such as computer games, animation\nand virtual reality. Recently, deep learning based terrain generation has\nemerged, notably methods based on generative adversarial networks (GANs).\nHowever, these methods often struggle to fulfill the requirements of flexible\nuser control and maintain generative diversity for realistic terrain.\nTherefore, we propose a novel diffusion-based method, namely terrain diffusion\nnetwork (TDN), which actively incorporates user guidance for enhanced\ncontrollability, taking into account terrain features like rivers, ridges,\nbasins, and peaks. Instead of adhering to a conventional monolithic denoising\nprocess, which often compromises the fidelity of terrain details or the\nalignment with user control, a multi-level denoising scheme is proposed to\ngenerate more realistic terrains by taking into account fine-grained details,\nparticularly those related to climatic patterns influenced by erosion and\ntectonic activities. Specifically, three terrain synthesisers are designed for\nstructural, intermediate, and fine-grained level denoising purposes, which\nallows each synthesiser to concentrate on a distinct terrain aspect. Moreover, to\nmaximise the efficiency of our TDN, we further introduce terrain and sketch\nlatent spaces for the synthesisers with pre-trained terrain autoencoders.\nComprehensive experiments on a new dataset constructed from NASA Topology\nImages clearly demonstrate the effectiveness of our proposed method, achieving\nstate-of-the-art performance. Our code and dataset will be publicly\navailable.\n","authors":["Zexin Hu","Kun Hu","Clinton Mo","Lei Pan","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16714v1","updated":"2023-08-31T13:28:32Z","published":"2023-08-31T13:28:32Z","title":"Towards Vehicle-to-everything Autonomous Driving: A Survey on\n Collaborative Perception","summary":" Vehicle-to-everything (V2X) autonomous driving opens up a promising direction\nfor developing a new generation of intelligent transportation systems.\nCollaborative perception (CP) as an essential component to achieve V2X can\novercome the inherent limitations of individual perception, including occlusion\nand long-range perception. In this survey, we provide a comprehensive review of\nCP methods for V2X scenarios, bringing a profound and in-depth understanding to\nthe community. Specifically, we first introduce the architecture and workflow\nof typical V2X systems, which affords a broader perspective to understand the\nentire V2X system and the role of CP within it. Then, we thoroughly summarize\nand analyze existing V2X perception datasets and CP methods. Particularly, we\nintroduce numerous CP methods from various crucial perspectives, including\ncollaboration stages, roadside sensors placement, latency compensation,\nperformance-bandwidth trade-off, attack/defense, pose alignment, etc.
Moreover,\nwe conduct extensive experimental analyses to compare and examine current CP\nmethods, revealing some essential and unexplored insights. Specifically, we\nanalyze the performance changes of different methods under different\nbandwidths, providing a deep insight into the performance-bandwidth trade-off\nissue. Also, we examine methods under different LiDAR ranges. To study the\nmodel robustness, we further investigate the effects of various simulated\nreal-world noises on the performance of different CP methods, covering\ncommunication latency, lossy communication, localization errors, and mixed\nnoises. In addition, we look into the sim-to-real generalization ability of\nexisting CP methods. At last, we thoroughly discuss issues and challenges,\nhighlighting promising directions for future efforts. Our codes for\nexperimental analysis will be public at\nhttps://github.com/memberRE/Collaborative-Perception.\n","authors":["Si Liu","Chen Gao","Yuan Chen","Xingyu Peng","Xianghao Kong","Kun Wang","Runsheng Xu","Wentao Jiang","Hao Xiang","Jiaqi Ma","Miao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16714v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2307.14863v3","updated":"2023-08-31T13:25:59Z","published":"2023-07-27T13:49:27Z","title":"IML-ViT: Benchmarking Image Manipulation Localization by Vision\n Transformer","summary":" Advanced image tampering techniques are increasingly challenging the\ntrustworthiness of multimedia, leading to the development of Image Manipulation\nLocalization (IML). But what makes a good IML model? The answer lies in the way\nto capture artifacts. Exploiting artifacts requires the model to extract\nnon-semantic discrepancies between manipulated and authentic regions,\nnecessitating explicit comparisons between the two areas. With the\nself-attention mechanism, naturally, the Transformer should be a better\ncandidate to capture artifacts. However, due to limited datasets, there is\ncurrently no pure ViT-based approach for IML to serve as a benchmark, and CNNs\ndominate the entire task. Nevertheless, CNNs suffer from weak long-range and\nnon-semantic modeling. To bridge this gap, based on the fact that artifacts are\nsensitive to image resolution, amplified under multi-scale features, and\nmassive at the manipulation border, we formulate the answer to the former\nquestion as building a ViT with high-resolution capacity, multi-scale feature\nextraction capability, and manipulation edge supervision that could converge\nwith a small amount of data. We term this simple but effective ViT paradigm\nIML-ViT, which has significant potential to become a new benchmark for IML.\nExtensive experiments on five benchmark datasets verified our model outperforms\nthe state-of-the-art manipulation localization methods.Code and models are\navailable at \\url{https://github.com/SunnyHaze/IML-ViT}.\n","authors":["Xiaochen Ma","Bo Du","Zhuohang Jiang","Ahmed Y. Al Hammadi","Jizhe Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.14863v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14505v2","updated":"2023-08-31T13:10:04Z","published":"2023-04-03T11:45:27Z","title":"Transformer-based interpretable multi-modal data fusion for skin lesion\n classification","summary":" A lot of deep learning (DL) research these days is mainly focused on\nimproving quantitative metrics regardless of other factors. 
In human-centered\napplications, like skin lesion classification in dermatology, DL-driven\nclinical decision support systems are still in their infancy due to the limited\ntransparency of their decision-making process. Moreover, the lack of procedures\nthat can explain the behavior of trained DL algorithms leads to almost no trust\nfrom clinical physicians. To diagnose skin lesions, dermatologists rely on\nvisual assessment of the disease and the data gathered from the patient's\nanamnesis. Data-driven algorithms dealing with multi-modal data are limited by\nthe separation of feature-level and decision-level fusion procedures required\nby convolutional architectures. To address this issue, we enable single-stage\nmulti-modal data fusion via the attention mechanism of transformer-based\narchitectures to aid in diagnosing skin diseases. Our method beats other\nstate-of-the-art single- and multi-modal DL architectures in image-rich and\npatient-data-rich environments. Additionally, the choice of the architecture\nenables native interpretability support for the classification task both in the\nimage and metadata domain with no additional modifications necessary.\n","authors":["Theodor Cheslerean-Boghiu","Melia-Evelina Fleischmann","Theresa Willem","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2304.14505v2.pdf","comment":"Submitted to IEEE JBHI in July 2023"},{"id":"http://arxiv.org/abs/2303.07806v2","updated":"2023-08-31T13:00:55Z","published":"2023-03-14T11:25:02Z","title":"USAGE: A Unified Seed Area Generation Paradigm for Weakly Supervised\n Semantic Segmentation","summary":" Seed area generation is usually the starting point of weakly supervised\nsemantic segmentation (WSSS). Computing the Class Activation Map (CAM) from a\nmulti-label classification network is the de facto paradigm for seed area\ngeneration, but CAMs generated from Convolutional Neural Networks (CNNs) and\nTransformers are prone to be under- and over-activated, respectively, which\nmakes the strategies to refine CAMs for CNNs usually inappropriate for\nTransformers, and vice versa. In this paper, we propose a Unified optimization\nparadigm for Seed Area GEneration (USAGE) for both types of networks, in which\nthe objective function to be optimized consists of two terms: One is a\ngeneration loss, which controls the shape of seed areas by a temperature\nparameter following a deterministic principle for different types of networks;\nThe other is a regularization loss, which ensures the consistency between the\nseed areas that are generated by self-adaptive network adjustment from\ndifferent views, to overturn false activation in seed areas. Experimental\nresults show that USAGE consistently improves seed area generation for both\nCNNs and Transformers by large margins, e.g., outperforming state-of-the-art\nmethods by a mIoU of 4.1% on PASCAL VOC. 
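Since CAM is named above as the de facto paradigm for seed-area generation, here is the standard CAM computation for reference (this is generic CAM, not the USAGE objective itself):

```python
# Standard class activation map from final-layer weights and conv features.
import torch
import torch.nn.functional as F

def class_activation_map(features, fc_weight, cls, out_size):
    """features: (B, C, h, w) last conv features; fc_weight: (num_cls, C)."""
    cam = torch.einsum("c,bchw->bhw", fc_weight[cls], features)
    cam = F.relu(cam)
    cam = cam / (cam.amax(dim=(1, 2), keepdim=True) + 1e-8)  # scale to [0, 1]
    return F.interpolate(cam[:, None], size=out_size, mode="bilinear")[:, 0]
```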
Moreover, based on the USAGE-generated\nseed areas on Transformers, we achieve state-of-the-art WSSS results on both\nPASCAL VOC and MS COCO.\n","authors":["Zelin Peng","Guanchun Wang","Lingxi Xie","Dongsheng Jiang","Wei Shen","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2303.07806v2.pdf","comment":"ICCV 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2308.16689v1","updated":"2023-08-31T12:46:36Z","published":"2023-08-31T12:46:36Z","title":"ViLTA: Enhancing Vision-Language Pre-training through Textual\n Augmentation","summary":" Vision-language pre-training (VLP) methods have been blossoming recently, and their\ncrucial goal is to jointly learn visual and textual features via a\ntransformer-based architecture, demonstrating promising improvements on a\nvariety of vision-language tasks. Prior arts usually focus on how to align\nvisual and textual features, but strategies for improving the robustness of the\nmodel and speeding up model convergence are left insufficiently explored.\n In this paper, we propose a novel method ViLTA, comprising two components\nto further facilitate the model in learning fine-grained representations among\nimage-text pairs. For Masked Language Modeling (MLM), we propose a\ncross-distillation method to generate soft labels to enhance the robustness of the\nmodel, which alleviates the problem of treating synonyms of masked words as\nnegative samples in one-hot labels. For Image-Text Matching (ITM), we leverage\nthe current language encoder to synthesize hard negatives based on the context\nof language input, encouraging the model to learn high-quality representations\nby increasing the difficulty of the ITM task. By leveraging the above\ntechniques, our ViLTA can achieve better performance on various vision-language\ntasks. Extensive experiments on benchmark datasets demonstrate the\neffectiveness of ViLTA and its promising potential for vision-language\npre-training.\n","authors":["Weihan Wang","Zhen Yang","Bin Xu","Juanzi Li","Yankui Sun"],"pdf_url":"https://arxiv.org/pdf/2308.16689v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2112.08060v2","updated":"2023-08-31T12:41:13Z","published":"2021-12-15T11:55:11Z","title":"Leveraging Image-based Generative Adversarial Networks for Time Series\n Generation","summary":" Generative models for images have gained significant attention in computer\nvision and natural language processing due to their ability to generate\nrealistic samples from complex data distributions. To leverage the advances of\nimage-based generative models for the time series domain, we propose a\ntwo-dimensional image representation for time series, the Extended\nIntertemporal Return Plot (XIRP). Our approach captures the intertemporal time\nseries dynamics in a scale-invariant and invertible way, reducing training time\nand improving sample quality. We benchmark synthetic XIRPs obtained by an\noff-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) to other image\nrepresentations and models regarding similarity and predictive ability metrics.\nOur novel, validated image representation for time series consistently and\nsignificantly outperforms a state-of-the-art RNN-based generative model\nregarding predictive ability.
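The off-the-shelf WGAN-GP mentioned above relies on the textbook gradient penalty; for completeness, that term looks like this (standard formulation, not the paper's training code):

```python
# WGAN-GP gradient penalty on random interpolates of real and fake batches.
import torch

def gradient_penalty(critic, real, fake):
    eps = torch.rand(real.size(0), 1, 1, 1, device=real.device)
    x_hat = (eps * real + (1 - eps) * fake).requires_grad_(True)
    grads, = torch.autograd.grad(critic(x_hat).sum(), x_hat, create_graph=True)
    return ((grads.flatten(1).norm(2, dim=1) - 1) ** 2).mean()
```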
Further, we introduce an improved stochastic\ninversion to substantially improve simulation quality regardless of the\nrepresentation and provide the prospect of transfer potentials in other\ndomains.\n","authors":["Justin Hellermann","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2112.08060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16684v1","updated":"2023-08-31T12:38:29Z","published":"2023-08-31T12:38:29Z","title":"Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor\n Attack","summary":" The vulnerabilities to backdoor attacks have recently threatened the\ntrustworthiness of machine learning models in practical applications.\nConventional wisdom suggests that not everyone can be an attacker since the\nprocess of designing the trigger generation algorithm often involves\nsignificant effort and extensive experimentation to ensure the attack's\nstealthiness and effectiveness. Alternatively, this paper shows that there\nexists a more severe backdoor threat: anyone can exploit an easily-accessible\nalgorithm for silent backdoor attacks. Specifically, this attacker can employ\nthe widely-used lossy image compression from a plethora of compression tools to\neffortlessly inject a trigger pattern into an image without leaving any\nnoticeable trace; i.e., the generated triggers are natural artifacts. One does\nnot require extensive knowledge to click on the \"convert\" or \"save as\" button\nwhile using tools for lossy image compression. Via this attack, the adversary\ndoes not need to design a trigger generator as seen in prior works and only\nrequires poisoning the data. Empirically, the proposed attack consistently\nachieves 100% attack success rate in several benchmark datasets such as MNIST,\nCIFAR-10, GTSRB and CelebA. More significantly, the proposed attack can still\nachieve almost 100% attack success rate with very small (approximately 10%)\npoisoning rates in the clean label setting. The generated trigger of the\nproposed attack using one lossy compression algorithm is also transferable\nacross other related compression algorithms, exacerbating the severity of this\nbackdoor threat. This work takes another crucial step toward understanding the\nextensive risks of backdoor attacks in practice, urging practitioners to\ninvestigate similar attacks and relevant backdoor mitigation methods.\n","authors":["Sze Jue Yang","Quang Nguyen","Chee Seng Chan","Khoa Doan"],"pdf_url":"https://arxiv.org/pdf/2308.16684v1.pdf","comment":"14 pages. This paper shows everyone can mount a powerful and stealthy\n backdoor attack with the widely-used lossy image compression"},{"id":"http://arxiv.org/abs/2308.16682v1","updated":"2023-08-31T12:36:50Z","published":"2023-08-31T12:36:50Z","title":"Diffusion Inertial Poser: Human Motion Reconstruction from Arbitrary\n Sparse IMU Configurations","summary":" Motion capture from a limited number of inertial measurement units (IMUs) has\nimportant applications in health, human performance, and virtual reality.\nReal-world limitations and application-specific goals dictate different IMU\nconfigurations (i.e., number of IMUs and chosen attachment body segments),\ntrading off accuracy and practicality. Although recent works were successful in\naccurately reconstructing whole-body motion from six IMUs, these systems only\nwork with a specific IMU configuration. Here we propose a single diffusion\ngenerative model, Diffusion Inertial Poser (DiffIP), which reconstructs human\nmotion in real-time from arbitrary IMU configurations. 
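The compression-as-trigger attack summarized above needs nothing beyond re-encoding images with a lossy codec; a sketch of the poisoning step (quality setting and paths are illustrative):

```python
# Poisoning by lossy re-encoding: the JPEG artifacts act as the trigger.
from PIL import Image

def poison_with_jpeg(src_path, dst_path, quality=10):
    img = Image.open(src_path).convert("RGB")
    img.save(dst_path, format="JPEG", quality=quality)
```

In the clean-label setting the summary describes, labels are left untouched and only a small fraction of the training images is re-encoded this way.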
We show that DiffIP has\nthe benefit of flexibility with respect to the IMU configuration while being as\naccurate as the state-of-the-art for the commonly used six IMU configuration.\nOur system enables selecting an optimal configuration for different\napplications without retraining the model. For example, when only four IMUs are\navailable, DiffIP found that the configuration that minimizes errors in joint\nkinematics instruments the thighs and forearms. However, global translation\nreconstruction is better when instrumenting the feet instead of the thighs.\nAlthough our approach is agnostic to the underlying model, we built DiffIP\nbased on physiologically realistic musculoskeletal models to enable use in\nbiomedical research and health applications.\n","authors":["Tom Van Wouwe","Seunghwan Lee","Antoine Falisse","Scott Delp","C. Karen Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10892v2","updated":"2023-08-31T12:09:45Z","published":"2022-11-20T07:30:15Z","title":"Towards Realistic Out-of-Distribution Detection: A Novel Evaluation\n Framework for Improving Generalization in OOD Detection","summary":" This paper presents a novel evaluation framework for Out-of-Distribution\n(OOD) detection that aims to assess the performance of machine learning models\nin more realistic settings. We observed that the real-world requirements for\ntesting OOD detection methods are not satisfied by the current testing\nprotocols. They usually encourage methods to have a strong bias towards a low\nlevel of diversity in normal data. To address this limitation, we propose new\nOOD test datasets (CIFAR-10-R, CIFAR-100-R, and ImageNet-30-R) that can allow\nresearchers to benchmark OOD detection performance under realistic distribution\nshifts. Additionally, we introduce a Generalizability Score (GS) to measure the\ngeneralization ability of a model during OOD detection. Our experiments\ndemonstrate that improving the performance on existing benchmark datasets does\nnot necessarily improve the usability of OOD detection models in real-world\nscenarios. While leveraging deep pre-trained features has been identified as a\npromising avenue for OOD detection research, our experiments show that\nstate-of-the-art pre-trained models tested on our proposed datasets suffer a\nsignificant drop in performance. To address this issue, we propose a\npost-processing stage for adapting pre-trained features under these\ndistribution shifts before calculating the OOD scores, which significantly\nenhances the performance of state-of-the-art pre-trained models on our\nbenchmarks.\n","authors":["Vahid Reza Khazaie","Anthony Wong","Mohammad Sabokrou"],"pdf_url":"https://arxiv.org/pdf/2211.10892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14500v3","updated":"2023-08-31T12:02:47Z","published":"2023-08-28T11:20:48Z","title":"LAC: Latent Action Composition for Skeleton-based Action Segmentation","summary":" Skeleton-based action segmentation requires recognizing composable actions in\nuntrimmed videos. Current approaches decouple this problem by first extracting\nlocal visual features from skeleton sequences and then processing them by a\ntemporal model to classify frame-wise actions. However, their performances\nremain limited as the visual features cannot sufficiently express composable\nactions. 
In this context, we propose Latent Action Composition (LAC), a novel\nself-supervised framework aiming at learning from synthesized composable\nmotions for skeleton-based action segmentation. LAC is composed of a novel\ngeneration module towards synthesizing new sequences. Specifically, we design a\nlinear latent space in the generator to represent primitive motion. New\ncomposed motions can be synthesized by simply performing arithmetic operations\non latent representations of multiple input skeleton sequences. LAC leverages\nsuch synthesized sequences, which have large diversity and complexity, for\nlearning visual representations of skeletons in both sequence and frame spaces\nvia contrastive learning. The resulting visual encoder has a high expressive\npower and can be effectively transferred onto action segmentation tasks by\nend-to-end fine-tuning without the need for additional temporal models. We\nconduct a study focusing on transfer-learning and we show that representations\nlearned from pre-trained LAC outperform the state-of-the-art by a large margin\non TSU, Charades, PKU-MMD datasets.\n","authors":["Di Yang","Yaohui Wang","Antitza Dantcheva","Quan Kong","Lorenzo Garattoni","Gianpiero Francesca","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2308.14500v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16651v1","updated":"2023-08-31T11:51:16Z","published":"2023-08-31T11:51:16Z","title":"SoccerNet 2023 Tracking Challenge -- 3rd place MOT4MOT Team Technical\n Report","summary":" The SoccerNet 2023 tracking challenge requires the detection and tracking of\nsoccer players and the ball. In this work, we present our approach to tackle\nthese tasks separately. We employ a state-of-the-art online multi-object\ntracker and a contemporary object detector for player tracking. To overcome the\nlimitations of our online approach, we incorporate a post-processing stage\nusing interpolation and appearance-free track merging. Additionally, an\nappearance-based track merging technique is used to handle the termination and\ncreation of tracks far from the image boundaries. Ball tracking is formulated\nas single object detection, and a fine-tuned YOLOv8l detector with proprietary\nfiltering improves the detection precision. Our method achieves 3rd place on\nthe SoccerNet 2023 tracking challenge with a HOTA score of 66.27.\n","authors":["Gal Shitrit","Ishay Be'ery","Ido Yerhushalmy"],"pdf_url":"https://arxiv.org/pdf/2308.16651v1.pdf","comment":"3 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.16649v1","updated":"2023-08-31T11:46:27Z","published":"2023-08-31T11:46:27Z","title":"Learning with Multi-modal Gradient Attention for Explainable Composed\n Image Retrieval","summary":" We consider the problem of composed image retrieval that takes an input query\nconsisting of an image and a modification text indicating the desired changes\nto be made on the image and retrieves images that match these changes. Current\nstate-of-the-art techniques that address this problem use global features for\nthe retrieval, resulting in incorrect localization of the regions of interest\nto be modified because of the global nature of the features, more so in cases\nof real-world, in-the-wild images. Since modifier texts usually correspond to\nspecific local changes in an image, it is critical that models learn local\nfeatures to be able to both localize and retrieve better. 
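The latent-space arithmetic in the LAC summary above can be shown in miniature; `encoder` and `decoder` are placeholders for the generator's components, and the convex combination is just one arbitrary choice of arithmetic:

```python
# Toy motion composition by mixing latent codes of two skeleton sequences.
import torch

def compose_motions(encoder, decoder, seq_a, seq_b, alpha=0.5):
    z = alpha * encoder(seq_a) + (1 - alpha) * encoder(seq_b)
    return decoder(z)
```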
To this end, our key\nnovelty is a new gradient-attention-based learning objective that explicitly\nforces the model to focus on the local regions of interest being modified in\neach retrieval step. We achieve this by first proposing a new visual image\nattention computation technique, which we call multi-modal gradient attention\n(MMGrad) that is explicitly conditioned on the modifier text. We next\ndemonstrate how MMGrad can be incorporated into an end-to-end model training\nstrategy with a new learning objective that explicitly forces these MMGrad\nattention maps to highlight the correct local regions corresponding to the\nmodifier text. By training retrieval models with this new loss function, we\nshow improved grounding by means of better visual attention maps, leading to\nbetter explainability of the models as well as competitive quantitative\nretrieval performance on standard benchmark datasets.\n","authors":["Prateksha Udhayanan","Srikrishna Karanam","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2308.16649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16648v1","updated":"2023-08-31T11:44:40Z","published":"2023-08-31T11:44:40Z","title":"Generate Your Own Scotland: Satellite Image Generation Conditioned on\n Maps","summary":" Despite recent advancements in image generation, diffusion models still\nremain largely underexplored in Earth Observation. In this paper we show that\nstate-of-the-art pretrained diffusion models can be conditioned on cartographic\ndata to generate realistic satellite images. We provide two large datasets of\npaired OpenStreetMap images and satellite views over the region of Mainland\nScotland and the Central Belt. We train a ControlNet model and qualitatively\nevaluate the results, demonstrating that both image quality and map fidelity\nare possible. Finally, we provide some insights on the opportunities and\nchallenges of applying these models for remote sensing. Our model weights and\ncode for creating the dataset are publicly available at\nhttps://github.com/miquel-espinosa/map-sat.\n","authors":["Miguel Espinosa","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2308.16648v1.pdf","comment":"13 pages, 6 figures. preprint"},{"id":"http://arxiv.org/abs/2302.07669v2","updated":"2023-08-31T11:24:15Z","published":"2023-02-15T14:06:39Z","title":"Unsupervised Hashing with Similarity Distribution Calibration","summary":" Unsupervised hashing methods typically aim to preserve the similarity between\ndata points in a feature space by mapping them to binary hash codes. However,\nthese methods often overlook the fact that the similarity between data points\nin the continuous feature space may not be preserved in the discrete hash code\nspace, due to the limited similarity range of hash codes. The similarity range\nis bounded by the code length and can lead to a problem known as similarity\ncollapse. That is, the positive and negative pairs of data points become less\ndistinguishable from each other in the hash space. To alleviate this problem,\nin this paper a novel Similarity Distribution Calibration (SDC) method is\nintroduced. SDC aligns the hash code similarity distribution towards a\ncalibration distribution (e.g., beta distribution) with sufficient spread\nacross the entire similarity range, thus alleviating the similarity collapse\nproblem. Extensive experiments show that our SDC outperforms significantly the\nstate-of-the-art alternatives on coarse category-level and instance-level image\nretrieval. 
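A rough rendering of the calibration idea in the SDC summary above: match the empirical distribution of pairwise hash similarities to spread-out beta quantiles (our own simplification, not the authors' loss):

```python
# Align batch hash-similarity quantiles with a beta reference distribution.
import numpy as np
import torch
from scipy.stats import beta

def sdc_like_loss(codes, a=0.5, b=0.5):
    """codes: (B, L) relaxed hash codes in [-1, 1]."""
    L = codes.size(1)
    sim = codes @ codes.t() / L                          # similarities in [-1, 1]
    iu = torch.triu_indices(sim.size(0), sim.size(0), offset=1)
    s = sim[iu[0], iu[1]].sort().values                  # empirical quantiles
    q = np.linspace(0.01, 0.99, s.numel())
    target = torch.as_tensor(2 * beta.ppf(q, a, b) - 1, dtype=s.dtype)
    return ((s - target) ** 2).mean()
```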
Code is available at https://github.com/kamwoh/sdc.\n","authors":["Kam Woh Ng","Xiatian Zhu","Jiun Tian Hoe","Chee Seng Chan","Tianyu Zhang","Yi-Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2302.07669v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2308.16637v1","updated":"2023-08-31T11:11:38Z","published":"2023-08-31T11:11:38Z","title":"Learning Channel Importance for High Content Imaging with Interpretable\n Deep Input Channel Mixing","summary":" Uncovering novel drug candidates for treating complex diseases remains one of\nthe most challenging tasks in early discovery research. To tackle this\nchallenge, biopharma research established a standardized high content imaging\nprotocol that tags different cellular compartments per image channel. In order\nto judge the experimental outcome, the scientist requires knowledge about the\nchannel importance with respect to a certain phenotype for decoding the\nunderlying biology. In contrast to traditional image analysis approaches, such\nexperiments are nowadays preferably analyzed by deep learning based approaches\nwhich, however, lack crucial information about the channel importance. To\novercome this limitation, we present a novel approach which utilizes\nmulti-spectral information of high content images to interpret a certain aspect\nof cellular biology. To this end, we base our method on image blending concepts\nwith alpha compositing for an arbitrary number of channels. More specifically,\nwe introduce DCMIX, a lightweight, scalable and end-to-end trainable mixing\nlayer which enables interpretable predictions in high content imaging while\nretaining the benefits of deep learning based methods. We conduct an extensive\nset of experiments on both MNIST and RXRX1 datasets, demonstrating that DCMIX\nlearns the biologically relevant channel importance without sacrificing\nprediction performance.\n","authors":["Daniel Siegismund","Mario Wieser","Stephan Heyse","Stephan Steigele"],"pdf_url":"https://arxiv.org/pdf/2308.16637v1.pdf","comment":"Accepted @ DAGM German Conference on Pattern Recognition (GCPR) 2023"},{"id":"http://arxiv.org/abs/2308.16635v1","updated":"2023-08-31T11:10:28Z","published":"2023-08-31T11:10:28Z","title":"MFR-Net: Multi-faceted Responsive Listening Head Generation via\n Denoising Diffusion Model","summary":" Face-to-face communication is a common scenario including roles of speakers\nand listeners. Most existing research methods focus on producing speaker\nvideos, while the generation of listener heads remains largely overlooked.\nResponsive listening head generation is an important task that aims to model\nface-to-face communication scenarios by generating a listener head video given\na speaker video and a listener head image. An ideal generated responsive\nlistening video should respond to the speaker with attitude or viewpoint\nexpression while maintaining diversity in interaction patterns and accuracy in\nlistener identity information. To achieve this goal, we propose the\n\\textbf{M}ulti-\\textbf{F}aceted \\textbf{R}esponsive Listening Head Generation\nNetwork (MFR-Net). Specifically, MFR-Net employs the probabilistic denoising\ndiffusion model to predict diverse head pose and expression features. In order\nto perform multi-faceted responses to the speaker video, while maintaining\naccurate listener identity preservation, we design the Feature Aggregation\nModule to boost listener identity features and fuse them with other\nspeaker-related features.
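The alpha-compositing mixing layer in the DCMIX summary above suggests a very small module; a minimal sketch (the real layer may differ substantially):

```python
# Learnable channel mixing: softmax weights act as channel importances.
import torch
import torch.nn as nn

class ChannelMix(nn.Module):
    def __init__(self, n_channels):
        super().__init__()
        self.logit_alpha = nn.Parameter(torch.zeros(n_channels))

    def forward(self, x):                                  # x: (B, C, H, W)
        alpha = torch.softmax(self.logit_alpha, dim=0)
        mixed = (x * alpha.view(1, -1, 1, 1)).sum(1, keepdim=True)
        return mixed, alpha       # 1-channel composite + channel importances
```

After training, `alpha` can be read out directly as the learned channel importance.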
Finally, a renderer finetuned with identity\nconsistency loss produces the final listening head videos. Our extensive\nexperiments demonstrate that MFR-Net not only achieves multi-faceted responses\nin diversity and speaker identity information but also in attitude and\nviewpoint expression.\n","authors":["Jin Liu","Xi Wang","Xiaomeng Fu","Yesheng Chai","Cai Yu","Jiao Dai","Jizhong Han"],"pdf_url":"https://arxiv.org/pdf/2308.16635v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.16633v1","updated":"2023-08-31T11:00:05Z","published":"2023-08-31T11:00:05Z","title":"Semi-Supervised SAR ATR Framework with Transductive Auxiliary\n Segmentation","summary":" Convolutional neural networks (CNNs) have achieved high performance in\nsynthetic aperture radar (SAR) automatic target recognition (ATR). However, the\nperformance of CNNs depends heavily on a large amount of training data. The\ninsufficiency of labeled training SAR images limits the recognition performance\nand even invalidates some ATR methods. Furthermore, under few labeled training\ndata, many existing CNNs are even ineffective. To address these challenges, we\npropose a Semi-supervised SAR ATR Framework with transductive Auxiliary\nSegmentation (SFAS). The proposed framework focuses on exploiting the\ntransductive generalization on available unlabeled samples with an auxiliary\nloss serving as a regularizer. Through auxiliary segmentation of unlabeled SAR\nsamples and information residue loss (IRL) in training, the framework can\nemploy the proposed training loop process and gradually exploit the information\ncompilation of recognition and segmentation to construct a helpful inductive\nbias and achieve high performance. Experiments conducted on the MSTAR dataset\nhave shown the effectiveness of our proposed SFAS for few-shot learning. The\nrecognition performance of 94.18\\% can be achieved under 20 training samples in\neach class with simultaneous accurate segmentation results. Facing variances of\nEOCs, the recognition ratios are higher than 88.00\\% when 10 training samples\neach class.\n","authors":["Chenwei Wang","Xiaoyu Liu","Yulin Huang","Siyi Luo","Jifang Pei","Jianyu Yang","Deqing Mao"],"pdf_url":"https://arxiv.org/pdf/2308.16633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16632v1","updated":"2023-08-31T11:00:03Z","published":"2023-08-31T11:00:03Z","title":"3D-STMN: Dependency-Driven Superpoint-Text Matching Network for\n End-to-End 3D Referring Expression Segmentation","summary":" In 3D Referring Expression Segmentation (3D-RES), the earlier approach adopts\na two-stage paradigm, extracting segmentation proposals and then matching them\nwith referring expressions. However, this conventional paradigm encounters\nsignificant challenges, most notably in terms of the generation of lackluster\ninitial proposals and a pronounced deceleration in inference speed. Recognizing\nthese limitations, we introduce an innovative end-to-end Superpoint-Text\nMatching Network (3D-STMN) that is enriched by dependency-driven insights. One\nof the keystones of our model is the Superpoint-Text Matching (STM) mechanism.\nUnlike traditional methods that navigate through instance proposals, STM\ndirectly correlates linguistic indications with their respective superpoints,\nclusters of semantically related points. 
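Superpoint-text matching as just described reduces, at its simplest, to scoring pooled superpoint embeddings against a text embedding; a toy version (the actual STM mechanism is learned end to end and far richer):

```python
# Rank superpoints by cosine similarity to a referring-expression embedding.
import torch
import torch.nn.functional as F

def match_superpoints(text_feat, sp_feats, k=5):
    """text_feat: (D,); sp_feats: (S, D) pooled superpoint embeddings."""
    sim = F.cosine_similarity(sp_feats, text_feat[None, :], dim=1)  # (S,)
    return sim.topk(k).indices
```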
This architectural decision empowers\nour model to efficiently harness cross-modal semantic relationships, primarily\nleveraging densely annotated superpoint-text pairs, as opposed to the more\nsparse instance-text pairs. In pursuit of enhancing the role of text in guiding\nthe segmentation process, we further incorporate the Dependency-Driven\nInteraction (DDI) module to deepen the network's semantic comprehension of\nreferring expressions. Using the dependency trees as a beacon, this module\ndiscerns the intricate relationships between primary terms and their associated\ndescriptors in expressions, thereby elevating both the localization and\nsegmentation capacities of our model. Comprehensive experiments on the\nScanRefer benchmark reveal that our model not only sets new performance\nstandards, registering an mIoU gain of 11.7 points, but also achieves a\nstaggering improvement in inference speed, surpassing traditional methods by\n95.7 times. The code and models are available at\nhttps://github.com/sosppxo/3D-STMN.\n","authors":["Changli Wu","Yiwei Ma","Qi Chen","Haowei Wang","Gen Luo","Jiayi Ji","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2308.16632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16612v1","updated":"2023-08-31T10:19:23Z","published":"2023-08-31T10:19:23Z","title":"Neural Gradient Regularizer","summary":" Owing to its significant success, the prior imposed on gradient maps has\nconsistently been a subject of great interest in the field of image processing.\nTotal variation (TV), one of the most representative regularizers, is known for\nits ability to capture the sparsity of gradient maps. Nonetheless, TV and its\nvariants often underestimate the gradient maps, leading to the weakening of\nedges and details whose gradients should not be zero in the original image.\nRecently, total deep variation (TDV) has been introduced, assuming the sparsity\nof feature maps, which provides a flexible regularization learned from\nlarge-scale datasets for a specific task. However, TDV requires retraining when\nthe image or task changes, limiting its versatility. In this paper, we propose\na neural gradient regularizer (NGR) that expresses the gradient map as the\noutput of a neural network. Unlike existing methods, NGR does not rely on the\nsparsity assumption, thereby avoiding the underestimation of gradient maps. NGR\nis applicable to various image types and different image processing tasks,\nfunctioning in a zero-shot learning fashion, making it a versatile and\nplug-and-play regularizer. Extensive experimental results demonstrate the\nsuperior performance of NGR over state-of-the-art counterparts for a range of\ndifferent tasks, further validating its effectiveness and versatility.\n","authors":["Shuang Xu","Yifan Wang","Zixiang Zhao","Jiangjun Peng","Xiangyong Cao","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16611v1","updated":"2023-08-31T10:16:59Z","published":"2023-08-31T10:16:59Z","title":"Detecting Out-of-Context Image-Caption Pairs in News: A\n Counter-Intuitive Method","summary":" The growth of misinformation and re-contextualized media in social media and\nnews leads to an increasing need for fact-checking methods. Concurrently, the\nadvancement in generative models makes cheapfakes and deepfakes both easier to\nmake and harder to detect.
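The dependency trees that the DDI module above builds on can be obtained with an off-the-shelf parser; a generic spaCy sketch for extracting a head noun and its descriptors (plain parsing, not the module itself; assumes the `en_core_web_sm` model is installed):

```python
# Extract the root token and its direct modifiers from an expression.
import spacy

nlp = spacy.load("en_core_web_sm")

def head_modifiers(expression: str):
    doc = nlp(expression)
    root = next(tok for tok in doc if tok.dep_ == "ROOT")
    return root.text, [(tok.text, tok.dep_) for tok in root.children]

# e.g. head_modifiers("the tall chair next to the window")
```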
In this paper, we present a novel approach using\ngenerative image models to our advantage for detecting Out-of-Context (OOC) use\nof image-caption pairs in news. We present two new datasets with a total of\n$6800$ images generated using two different generative models including (1)\nDALL-E 2, and (2) Stable-Diffusion. We are confident that the method proposed\nin this paper can further research on generative models in the field of\ncheapfake detection, and that the resulting datasets can be used to train and\nevaluate new models aimed at detecting cheapfakes. We run a preliminary\nqualitative and quantitative analysis to evaluate the performance of each image\ngeneration model for this task, and evaluate a handful of methods for computing\nimage similarity.\n","authors":["Eivind Moholdt","Sohail Ahmed Khan","Duc-Tien Dang-Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.16611v1.pdf","comment":"ACM International Conference on Content-Based Multimedia Indexing\n (CBMI '23)"},{"id":"http://arxiv.org/abs/2303.05800v2","updated":"2023-08-31T10:09:06Z","published":"2023-03-10T09:09:37Z","title":"Enhancing the accuracies by performing pooling decisions adjacent to the\n output layer","summary":" Learning classification tasks of (2^n x 2^n) inputs typically consists of at most n\n(2x2) max-pooling (MP) operators along the entire feedforward deep\narchitecture. Here we show, using the CIFAR-10 database, that pooling decisions\nadjacent to the last convolutional layer significantly enhance accuracies. In\nparticular, average accuracies of the advanced-VGG with m layers (A-VGGm)\narchitectures are 0.936, 0.940, 0.954, 0.955, and 0.955 for m=6, 8, 14, 13, and\n16, respectively. The results indicate A-VGG8's accuracy is superior to\nVGG16's, and that the accuracies of A-VGG13 and A-VGG16 are equal, and\ncomparable to that of Wide-ResNet16. In addition, replacing the three fully\nconnected (FC) layers with one FC layer, A-VGG6 and A-VGG14, or with several\nlinear activation FC layers, yielded similar accuracies. These significantly\nenhanced accuracies stem from training the most influential input-output\nroutes, in comparison to the inferior routes selected following multiple MP\ndecisions along the deep architecture. In addition, accuracies are sensitive to\nthe order of the non-commutative MP and average pooling operators adjacent to\nthe output layer, varying the number and location of training routes. The\nresults call for the reexamination of previously proposed deep architectures\nand their accuracies by utilizing the proposed pooling strategy adjacent to the\noutput layer.\n","authors":["Yuval Meir","Yarden Tzach","Ronit D. Gross","Ofek Tevet","Roni Vardi","Ido Kanter"],"pdf_url":"https://arxiv.org/pdf/2303.05800v2.pdf","comment":"29 pages, 3 figures, 1 table, and Supplementary Information"},{"id":"http://arxiv.org/abs/2308.16598v1","updated":"2023-08-31T09:57:27Z","published":"2023-08-31T09:57:27Z","title":"Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation","summary":" Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential\nrole in the early diagnosis and treatment of liver cancer. Deep learning models\nbackboned by fully convolutional neural networks (FCNNs) have become the\ndominant model for segmenting 3D computerized tomography (CT) scans. However,\nsince their convolution layers suffer from limited kernel size, they are not\nable to capture long-range dependencies and global context.
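Generating images from captions with Stable Diffusion, as in the dataset construction above, is a few lines with the `diffusers` library (the model id and sampling parameters are illustrative, not the paper's exact setup):

```python
# Caption-conditioned image generation with an off-the-shelf SD pipeline.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")

def generate_for_caption(caption: str, seed: int = 0):
    g = torch.Generator("cuda").manual_seed(seed)
    return pipe(caption, num_inference_steps=30, generator=g).images[0]
```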
To tackle this\nrestriction, vision transformers have been introduced to solve FCNN's locality\nof receptive fields. Although transformers can capture long-range features,\ntheir segmentation performance decreases with various tumor sizes due to the\nmodel sensitivity to the input patch size. While finding an optimal patch size\nimproves the performance of vision transformer-based models on segmentation\ntasks, it is a time-consuming and challenging procedure. This paper proposes a\ntechnique to select the vision transformer's optimal input multi-resolution\nimage patch size based on the average volume size of metastasis lesions. We\nfurther validated our suggested framework using a transfer-learning technique,\ndemonstrating that the highest Dice similarity coefficient (DSC) performance\nwas obtained by pre-training on training data with a larger tumour volume using\nthe suggested ideal patch size and then training with a smaller one. We\nexperimentally evaluate this idea through pre-training our model on a\nmulti-resolution public dataset. Our model showed consistent and improved\nresults when applied to our private multi-resolution mCRC dataset with a\nsmaller average tumor volume. This study lays the groundwork for optimizing\nsemantic segmentation of small objects using vision transformers. The\nimplementation source code is available\nat:https://github.com/Ramtin-Mojtahedi/OVTPS.\n","authors":["Ramtin Mojtahedi","Mohammad Hamghalam","Richard K. G. Do","Amber L. Simpson"],"pdf_url":"https://arxiv.org/pdf/2308.16598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08396v4","updated":"2023-08-31T09:43:37Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with a\nnominal memory and computational burden. The inclusion of multi-axis\nself-attention, within each decoder stage, significantly enhances the\ndiscriminating capacity between the object and background regions, thereby\nhelping in improving the segmentation efficiency. In the Hybrid Decoder block,\nthe fusion process commences by integrating the upsampled lower-level decoder\nfeatures, obtained through transpose convolution, with the skip-connection\nfeatures derived from the hybrid encoder. Subsequently, the fused features\nundergo refinement through the utilization of a multi-axis attention mechanism.\nThe proposed decoder block is repeated multiple times to progressively segment\nthe nuclei regions. Experimental results on MoNuSeg18 and MoNuSAC20 dataset\ndemonstrates the effectiveness of the proposed technique. Our MaxViT-UNet\noutperformed the previous CNN-based (UNet) and Transformer-based (Swin-UNet)\ntechniques by a considerable margin on both of the standard datasets. 
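The patch-size selection idea above ties the transformer's input patch to the average lesion volume; one plausible, purely illustrative rule picks the candidate patch edge closest to the edge of the volume-equivalent cube (the paper's exact rule is not reproduced here):

```python
# Map an average lesion volume (in voxels) to the nearest patch size.
def suggest_patch_size(avg_lesion_voxels, candidates=(8, 16, 32)):
    side = avg_lesion_voxels ** (1.0 / 3.0)   # edge of equivalent cube
    return min(candidates, key=lambda p: abs(p - side))
```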
The\nfollowing github (https://github.com/PRLAB21/MaxViT-UNet) contains the\nimplementation and trained weights.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v4.pdf","comment":"17 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.12751v2","updated":"2023-08-31T09:42:09Z","published":"2023-07-24T12:42:45Z","title":"ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised\n Real-world Single Image Super-Resolution","summary":" Single image super-resolution (SISR) is a challenging ill-posed problem that\naims to up-sample a given low-resolution (LR) image to a high-resolution (HR)\ncounterpart. Due to the difficulty in obtaining real LR-HR training pairs,\nrecent approaches are trained on simulated LR images degraded by simplified\ndown-sampling operators, e.g., bicubic. Such an approach can be problematic in\npractice because of the large gap between the synthesized and real-world LR\nimages. To alleviate the issue, we propose a novel Invertible scale-Conditional\nFunction (ICF), which can scale an input image and then restore the original\ninput with different scale conditions. By leveraging the proposed ICF, we\nconstruct a novel self-supervised SISR framework (ICF-SRSR) to handle the\nreal-world SR task without using any paired/unpaired training data.\nFurthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs,\nwhich can make existing supervised SISR networks more robust. Extensive\nexperiments demonstrate the effectiveness of the proposed method in handling\nSISR in a fully self-supervised manner. Our ICF-SRSR demonstrates superior\nperformance compared to the existing methods trained on synthetic paired images\nin real-world scenarios and exhibits comparable performance compared to\nstate-of-the-art supervised/unsupervised methods on public benchmark datasets.\n","authors":["Reyhaneh Neshatavar","Mohsen Yavartanoo","Sanghyun Son","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16582v1","updated":"2023-08-31T09:27:56Z","published":"2023-08-31T09:27:56Z","title":"Any-Size-Diffusion: Toward Efficient Text-Driven Synthesis for Any-Size\n HD Images","summary":" Stable diffusion, a generative model used in text-to-image synthesis,\nfrequently encounters resolution-induced composition problems when generating\nimages of varying sizes. This issue primarily stems from the model being\ntrained on pairs of single-scale images and their corresponding text\ndescriptions. Moreover, direct training on images of unlimited sizes is\nunfeasible, as it would require an immense number of text-image pairs and\nentail substantial computational expenses. To overcome these challenges, we\npropose a two-stage pipeline named Any-Size-Diffusion (ASD), designed to\nefficiently generate well-composed images of any size, while minimizing the\nneed for high-memory GPU resources. Specifically, the initial stage, dubbed Any\nRatio Adaptability Diffusion (ARAD), leverages a selected set of images with a\nrestricted range of ratios to optimize the text-conditional diffusion model,\nthereby improving its ability to adjust composition to accommodate diverse\nimage sizes. To support the creation of images at any desired size, we further\nintroduce a technique called Fast Seamless Tiled Diffusion (FSTD) at the\nsubsequent stage. This method allows for the rapid enlargement of the ASD\noutput to any high-resolution size, avoiding seaming artifacts or memory\noverloads. 
Experimental results on the LAION-COCO and MM-CelebA-HQ benchmarks\ndemonstrate that ASD can produce well-structured images of arbitrary sizes,\ncutting down the inference time by 2x compared to the traditional tiled\nalgorithm.\n","authors":["Qingping Zheng","Yuanfan Guo","Jiankang Deng","Jianhua Han","Ying Li","Songcen Xu","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.16582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12319v2","updated":"2023-08-31T09:26:57Z","published":"2023-08-23T11:31:38Z","title":"RemovalNet: DNN Fingerprint Removal Attacks","summary":" With the performance of deep neural networks (DNNs) remarkably improving,\nDNNs have been widely used in many areas. Consequently, the DNN model has\nbecome a valuable asset, and its intellectual property is safeguarded by\nownership verification techniques (e.g., DNN fingerprinting). However, the\nfeasibility of the DNN fingerprint removal attack and its potential influence\nremain an open problem. In this paper, we perform the first comprehensive\ninvestigation of DNN fingerprint removal attacks. Generally, the knowledge\ncontained in a DNN model can be categorized into general semantic and\nfingerprint-specific knowledge. To this end, we propose a min-max bilevel\noptimization-based DNN fingerprint removal attack named RemovalNet, to evade\nmodel ownership verification. The lower-level optimization is designed to\nremove fingerprint-specific knowledge, while the upper-level optimization\ndistills the victim model's general semantic knowledge to maintain the\nsurrogate model's performance. We conduct extensive experiments to evaluate the\nfidelity, effectiveness, and efficiency of the RemovalNet against four advanced\ndefense methods on six metrics. The empirical results demonstrate that (1) the\nRemovalNet is effective: after our DNN fingerprint removal attack, the model\ndistance between the target and surrogate models is 100x higher than that\nof the baseline attacks; (2) the RemovalNet is efficient: it uses only 0.2%\n(400 samples) of the substitute dataset and 1,000 iterations to conduct our\nattack, and compared with advanced model stealing attacks, the RemovalNet\nsaves nearly 85% of computational resources at most; (3) the RemovalNet\nachieves high fidelity: the created surrogate model maintains high accuracy\nafter the DNN fingerprint removal process. Our code is available at:\nhttps://github.com/grasses/RemovalNet.\n","authors":["Hongwei Yao","Zheng Li","Kunzhe Huang","Jian Lou","Zhan Qin","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2308.12319v2.pdf","comment":"some mistake"},{"id":"http://arxiv.org/abs/2308.16576v1","updated":"2023-08-31T09:19:06Z","published":"2023-08-31T09:19:06Z","title":"GHuNeRF: Generalizable Human NeRF from a Monocular Video","summary":" In this paper, we tackle the challenging task of learning a generalizable\nhuman NeRF model from a monocular video. Although existing generalizable human\nNeRFs have achieved impressive results, they require multi-view images or videos,\nwhich might not always be available. On the other hand, some works on\nfree-viewpoint rendering of humans from monocular videos cannot be generalized\nto unseen identities. In view of these limitations, we propose GHuNeRF to learn\na generalizable human NeRF model from a monocular video of the human performer.\nWe first introduce a visibility-aware aggregation scheme to compute vertex-wise\nfeatures, which is used to construct a 3D feature volume. 
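The FSTD stage in the Any-Size-Diffusion abstract above enlarges outputs tile by tile while avoiding seams. A generic way to keep tile borders invisible (an illustrative sketch, not the paper's algorithm; all names are mine) is to blend overlapping tiles with ramp weights that fade out toward each tile's edge:

```python
import numpy as np

def ramp_weights(h, w, overlap):
    """2D weight map that fades toward the tile borders over `overlap` pixels."""
    wy = np.minimum(np.arange(h) + 1, np.arange(h)[::-1] + 1)
    wx = np.minimum(np.arange(w) + 1, np.arange(w)[::-1] + 1)
    wy = np.clip(wy / float(overlap), 0.0, 1.0)
    wx = np.clip(wx / float(overlap), 0.0, 1.0)
    return np.outer(wy, wx)

def blend_tiles(canvas_hw, tiles, overlap=16):
    """tiles: list of (y, x, tile_array); returns the weighted-average canvas."""
    acc = np.zeros(canvas_hw, dtype=np.float64)
    wsum = np.zeros(canvas_hw, dtype=np.float64)
    for y, x, t in tiles:
        w = ramp_weights(*t.shape, overlap)
        acc[y:y + t.shape[0], x:x + t.shape[1]] += t * w
        wsum[y:y + t.shape[0], x:x + t.shape[1]] += w
    return acc / np.maximum(wsum, 1e-8)

# two overlapping toy tiles: the overlap region transitions smoothly
canvas = blend_tiles((64, 64), [(0, 0, np.ones((64, 40))), (0, 24, np.zeros((64, 40)))])
```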
The feature volume\ncan only represent the overall geometry of the human performer with\ninsufficient accuracy due to the limited resolution. To solve this, we further\nenhance the volume feature with temporally aligned point-wise features using an\nattention mechanism. Finally, the enhanced feature is used for predicting\ndensity and color for each sampled point. A surface-guided sampling strategy is\nalso introduced to improve the efficiency for both training and inference. We\nvalidate our approach on the widely-used ZJU-MoCap dataset, where we achieve\ncomparable performance with existing multi-view video based approaches. We also\ntest on the monocular People-Snapshot dataset and achieve better performance\nthan existing works when only monocular video is used.\n","authors":["Chen Li","Jihao Lin","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2308.16576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16573v1","updated":"2023-08-31T09:13:34Z","published":"2023-08-31T09:13:34Z","title":"Dual-Decoder Consistency via Pseudo-Labels Guided Data Augmentation for\n Semi-Supervised Medical Image Segmentation","summary":" Medical image segmentation methods often rely on fully supervised approaches\nto achieve excellent performance, which is contingent upon having an extensive\nset of labeled images for training. However, annotating medical images is both\nexpensive and time-consuming. Semi-supervised learning offers a solution by\nleveraging numerous unlabeled images alongside a limited set of annotated ones.\nIn this paper, we introduce a semi-supervised medical image segmentation method\nbased on the mean-teacher model, referred to as Dual-Decoder Consistency via\nPseudo-Labels Guided Data Augmentation (DCPA). This method combines consistency\nregularization, pseudo-labels, and data augmentation to enhance the efficacy of\nsemi-supervised segmentation. Firstly, the proposed model comprises both\nstudent and teacher models with a shared encoder and two distinct decoders\nemploying different up-sampling strategies. Minimizing the output discrepancy\nbetween decoders enforces the generation of consistent representations, serving\nas regularization during student model training. Secondly, we introduce mixup\noperations to blend unlabeled data with labeled data, creating mixed data and\nthereby achieving data augmentation. Lastly, pseudo-labels are generated by the\nteacher model and utilized as labels for mixed data to compute unsupervised\nloss. We compare the segmentation results of the DCPA model with six\nstate-of-the-art semi-supervised methods on three publicly available medical\ndatasets. Beyond classical 10\\% and 20\\% semi-supervised settings, we\ninvestigate performance with less supervision (5\\% labeled data). Experimental\noutcomes demonstrate that our approach consistently outperforms existing\nsemi-supervised medical image segmentation methods across the three\nsemi-supervised settings.\n","authors":["Yuanbin Chen","Tao Wang","Hui Tang","Longxuan Zhao","Ruige Zong","Tao Tan","Xinlin Zhang","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2308.16573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16572v1","updated":"2023-08-31T09:13:30Z","published":"2023-08-31T09:13:30Z","title":"CL-MAE: Curriculum-Learned Masked Autoencoders","summary":" Masked image modeling has been demonstrated as a powerful pretext task for\ngenerating robust representations that can be effectively generalized across\nmultiple downstream tasks. 
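The DCPA abstract above mixes unlabeled images with labeled ones and supervises the mix with the teacher's pseudo-labels. A minimal sketch of such a mixup step (the function names, the Beta-sampled coefficient, and the dummy teacher are assumptions, not the authors' code):

```python
import numpy as np

def mixup_pseudo(x_labeled, y_labeled, x_unlabeled, teacher_predict, alpha=0.5):
    """Returns mixed inputs and mixed (soft) targets for the unsupervised loss."""
    lam = np.random.beta(alpha, alpha)
    y_pseudo = teacher_predict(x_unlabeled)           # teacher's soft predictions
    x_mixed = lam * x_labeled + (1.0 - lam) * x_unlabeled
    y_mixed = lam * y_labeled + (1.0 - lam) * y_pseudo
    return x_mixed, y_mixed

# toy usage with a thresholding stand-in for the teacher model
x_l, y_l = np.random.rand(2, 32, 32), np.random.rand(2, 32, 32)
x_u = np.random.rand(2, 32, 32)
x_m, y_m = mixup_pseudo(x_l, y_l, x_u, teacher_predict=lambda x: (x > 0.5).astype(float))
```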
Typically, this approach involves randomly masking\npatches (tokens) in input images, with the masking strategy remaining unchanged\nduring training. In this paper, we propose a curriculum learning approach that\nupdates the masking strategy to continually increase the complexity of the\nself-supervised reconstruction task. We conjecture that, by gradually\nincreasing the task complexity, the model can learn more sophisticated and\ntransferable representations. To facilitate this, we introduce a novel\nlearnable masking module that possesses the capability to generate masks of\ndifferent complexities, and integrate the proposed module into masked\nautoencoders (MAE). Our module is jointly trained with the MAE, while adjusting\nits behavior during training, transitioning from a partner of the MAE\n(optimizing the same reconstruction loss) to an adversary (optimizing the\nopposite loss), passing through a neutral state along the way. The transition between\nthese behaviors is smooth, being regulated by a factor that is multiplied with\nthe reconstruction loss of the masking module. The resulting training procedure\ngenerates an easy-to-hard curriculum. We train our Curriculum-Learned Masked\nAutoencoder (CL-MAE) on ImageNet and show that it exhibits superior\nrepresentation learning capabilities compared to MAE. The empirical results on\nfive downstream tasks confirm our conjecture, demonstrating that curriculum\nlearning can be successfully used to self-supervise masked autoencoders.\n","authors":["Neelu Madan","Nicolae-Catalin Ristea","Kamal Nasrollahi","Thomas B. Moeslund","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2308.16572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16571v1","updated":"2023-08-31T09:12:34Z","published":"2023-08-31T09:12:34Z","title":"Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based\n Approach","summary":" In the rapidly evolving digital era, the analysis of document layouts plays a\npivotal role in automated information extraction and interpretation. In our\nwork, we have trained the MViTv2 transformer architecture with Cascade Mask\nR-CNN on the BaDLAD dataset to extract text boxes, paragraphs, images, and tables from\na document. After training on 20,365 document images for 36 epochs in a 3-phase\ncycle, we achieved a training loss of 0.2125 and a mask loss of 0.19. Our work\nextends beyond training, delving into the exploration of potential enhancement\navenues. We investigate the impact of rotation and flip augmentation, the\neffectiveness of slicing input images pre-inference, the implications of\nvarying the resolution of the transformer backbone, and the potential of\nemploying a dual-pass inference to uncover missed text boxes. Through these\nexplorations, we observe a spectrum of outcomes, where some modifications\nresult in tangible performance improvements, while others offer unique insights\nfor future endeavors.\n","authors":["Ashrafur Rahman Khan","Asif Azad"],"pdf_url":"https://arxiv.org/pdf/2308.16571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05970v2","updated":"2023-08-31T09:11:22Z","published":"2023-05-10T08:28:51Z","title":"FusionBooster: A Unified Image Fusion Boosting Paradigm","summary":" In recent years, numerous ideas have emerged for designing a mutually\nreinforcing mechanism or extra stages for the image fusion task, ignoring the\ninevitable gaps between different vision tasks and the computational burden. 
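The CL-MAE abstract above regulates its masking module with a factor that moves smoothly from cooperative to adversarial. A toy rendering of that schedule (the cosine form is my assumption; the abstract only requires a smooth +1 to -1 transition through a neutral point):

```python
import math

def curriculum_factor(step, total_steps):
    """+1 at the start (partner), 0 midway (neutral), -1 at the end (adversary)."""
    return math.cos(math.pi * step / total_steps)

def masking_module_loss(recon_loss, step, total_steps):
    """Masking module optimizes the reconstruction loss scaled by the factor."""
    return curriculum_factor(step, total_steps) * recon_loss

for s in (0, 500, 1000):
    print(round(curriculum_factor(s, 1000), 2))  # 1.0, 0.0, -1.0
```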
We\nargue that there is scope to improve the fusion performance with the help of\nthe FusionBooster, a model specifically designed for the fusion task. In\nparticular, our booster is based on the divide-and-conquer strategy controlled\nby an information probe. The booster is composed of three building blocks: the\nprobe units, the booster layer, and the assembling module. Given the result\nproduced by a backbone method, the probe units assess the fused image and\ndivide the results according to their information content. This is instrumental\nin identifying missing information, as a step toward its recovery. The recovery of\nthe degraded components, along with the fusion guidance, is the role of the\nbooster layer. Lastly, the assembling module is responsible for piecing these\nadvanced components together to deliver the output. We use concise\nreconstruction loss functions in conjunction with lightweight autoencoder\nmodels to formulate the learning task, with only a marginal increase in\ncomputational complexity. The experimental results obtained in various fusion tasks, as well as\ndownstream detection tasks, consistently demonstrate that the proposed\nFusionBooster significantly improves the performance. Our code will be publicly\navailable on the project homepage.\n","authors":["Chunyang Cheng","Tianyang Xu","Xiao-Jun Wu","Hui Li","Xi Li","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2305.05970v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2308.16568v1","updated":"2023-08-31T09:02:53Z","published":"2023-08-31T09:02:53Z","title":"Shape of my heart: Cardiac models through learned signed distance\n functions","summary":" The efficient construction of an anatomical model is one of the major\nchallenges of patient-specific in-silico models of the human heart. Current\nmethods frequently rely on linear statistical models, which allow no advanced\ntopological changes, or require medical image segmentation followed by a\nmeshing pipeline, which strongly depends on image resolution, quality, and\nmodality. These approaches are therefore limited in their transferability to\nother imaging domains. In this work, the cardiac shape is reconstructed by\nmeans of three-dimensional deep signed distance functions with Lipschitz\nregularity. For this purpose, the shapes of cardiac MRI reconstructions are\nlearned from public databases to model the spatial relation of multiple\nchambers in Cartesian space. We demonstrate that this approach is also capable\nof reconstructing anatomical models from partial data, such as point clouds\nfrom a single ventricle, or modalities different from the trained MRI, such as\nelectroanatomical mapping, and in addition, allows us to generate new\nanatomical shapes by randomly sampling latent vectors.\n","authors":["Jan Verhülsdonk","Thomas Grandits","Francisco Sahli Costabal","Rolf Krause","Angelo Auricchio","Gundolf Haase","Simone Pezzuto","Alexander Effland"],"pdf_url":"https://arxiv.org/pdf/2308.16568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16567v1","updated":"2023-08-31T09:01:45Z","published":"2023-08-31T09:01:45Z","title":"ScrollNet: Dynamic Weight Importance for Continual Learning","summary":" The principle underlying most existing continual learning (CL) methods is to\nprioritize stability by penalizing changes in parameters crucial to old tasks,\nwhile allowing for plasticity in other parameters. 
The importance of weights\nfor each task can be determined either explicitly through learning a\ntask-specific mask during training (e.g., parameter isolation-based approaches)\nor implicitly by introducing a regularization term (e.g., regularization-based\napproaches). However, all these methods assume that the importance of weights\nfor each task is unknown prior to data exposure. In this paper, we propose\nScrollNet as a scrolling neural network for continual learning. ScrollNet can\nbe seen as a dynamic network that assigns the ranking of weight importance for\neach task before data exposure, thus achieving a more favorable\nstability-plasticity tradeoff during sequential task learning by reassigning\nthis ranking for different tasks. Additionally, we demonstrate that ScrollNet\ncan be combined with various CL methods, including regularization-based and\nreplay-based approaches. Experimental results on CIFAR100 and TinyImagenet\ndatasets show the effectiveness of our proposed method. We release our code at\nhttps://github.com/FireFYF/ScrollNet.git.\n","authors":["Fei Yang","Kai Wang","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2308.16567v1.pdf","comment":"Accepted at Visual Continual Learning workshop (ICCV2023)"},{"id":"http://arxiv.org/abs/2303.00262v2","updated":"2023-08-31T09:01:35Z","published":"2023-03-01T06:35:42Z","title":"Collage Diffusion","summary":" We seek to give users precise control over diffusion-based image generation\nby modeling complex scenes as sequences of layers, which define the desired\nspatial arrangement and visual attributes of objects in the scene. Collage\nDiffusion harmonizes the input layers to make objects fit together -- the key\nchallenge involves minimizing changes in the positions and key visual\nattributes of the input layers while allowing other attributes to change in the\nharmonization process. We ensure that objects are generated in the correct\nlocations by modifying text-image cross-attention with the layers' alpha masks.\nWe preserve key visual attributes of input layers by learning specialized text\nrepresentations per layer and by extending ControlNet to operate on layers.\nLayer input allows users to control the extent of image harmonization on a\nper-object basis, and users can even iteratively edit individual objects in\ngenerated images while keeping other objects fixed. By leveraging the rich\ninformation present in layer input, Collage Diffusion generates globally\nharmonized images that maintain desired object characteristics better than\nprior approaches.\n","authors":["Vishnu Sarukkai","Linden Li","Arden Ma","Christopher Ré","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2303.00262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16561v1","updated":"2023-08-31T08:54:59Z","published":"2023-08-31T08:54:59Z","title":"MoMA: Momentum Contrastive Learning with Multi-head Attention-based\n Knowledge Distillation for Histopathology Image Analysis","summary":" There is no doubt that advanced artificial intelligence models and high\nquality data are the keys to success in developing computational pathology\ntools. Although the overall volume of pathology data keeps increasing, a lack\nof quality data is a common issue when it comes to a specific task due to\nseveral reasons including privacy and ethical issues with patient data. In this\nwork, we propose to exploit knowledge distillation, i.e., utilize the existing\nmodel to learn a new, target model, to overcome such issues in computational\npathology. 
Specifically, we employ a student-teacher framework to learn a\ntarget model from a pre-trained teacher model without direct access to source\ndata and distill relevant knowledge via momentum contrastive learning with a\nmulti-head attention mechanism, which provides consistent and context-aware\nfeature representations. This enables the target model to assimilate\ninformative representations of the teacher model while seamlessly adapting to\nthe unique nuances of the target data. The proposed method is rigorously\nevaluated across different scenarios where the teacher model was trained on the\nsame, relevant, or irrelevant classification tasks as the target model.\nExperimental results demonstrate the accuracy and robustness of our approach in\ntransferring knowledge to different domains and tasks, outperforming other\nrelated methods. Moreover, the results provide a guideline on the learning\nstrategy for different types of tasks and scenarios in computational pathology.\nCode is available at: \\url{https://github.com/trinhvg/MoMA}.\n","authors":["Trinh Thi Le Vuong","Jin Tae Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.16561v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.16555v1","updated":"2023-08-31T08:46:12Z","published":"2023-08-31T08:46:12Z","title":"E3CM: Epipolar-Constrained Cascade Correspondence Matching","summary":" Accurate and robust correspondence matching is of utmost importance for\nvarious 3D computer vision tasks. However, traditional explicit\nprogramming-based methods often struggle to handle challenging scenarios, and\ndeep learning-based methods require large well-labeled datasets for network\ntraining. In this article, we introduce Epipolar-Constrained Cascade\nCorrespondence (E3CM), a novel approach that addresses these limitations.\nUnlike traditional methods, E3CM leverages pre-trained convolutional neural\nnetworks to match correspondences, without requiring annotated data for any\nnetwork training or fine-tuning. Our method utilizes epipolar constraints to\nguide the matching process and incorporates a cascade structure for progressive\nrefinement of matches. We extensively evaluate the performance of E3CM through\ncomprehensive experiments and demonstrate its superiority over existing\nmethods. To promote further research and facilitate reproducibility, we make\nour source code publicly available at https://mias.group/E3CM.\n","authors":["Chenbo Zhou","Shuai Su","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2308.16555v1.pdf","comment":"accepted to Neurocomputing"},{"id":"http://arxiv.org/abs/2308.16552v1","updated":"2023-08-31T08:43:52Z","published":"2023-08-31T08:43:52Z","title":"Prompt-enhanced Hierarchical Transformer Elevating Cardiopulmonary\n Resuscitation Instruction via Temporal Action Segmentation","summary":" The vast majority of people who suffer unexpected cardiac arrest receive\ncardiopulmonary resuscitation (CPR) from passersby in a desperate\nattempt to restore life, but these endeavors often turn out to be fruitless because\nthe rescuers lack proper training. Fortunately, many studies show that\ndisciplined training will help to elevate the success rate of resuscitation,\nand such training constantly calls for a seamless combination of novel techniques to yield\nfurther advancement. 
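The E3CM abstract above relies on epipolar constraints to vet candidate matches. As a generic illustration of that constraint (standard two-view geometry, not the authors' implementation; all names are mine), one can score putative correspondences by their Sampson distance under a fundamental matrix F and keep the consistent ones:

```python
import numpy as np

def epipolar_errors(F, pts1, pts2):
    """Sampson distance of putative matches under fundamental matrix F."""
    ones = np.ones((len(pts1), 1))
    x1 = np.hstack([np.asarray(pts1, float), ones])   # (N, 3) homogeneous points
    x2 = np.hstack([np.asarray(pts2, float), ones])
    Fx1 = x1 @ F.T                                    # epipolar lines in image 2
    Ftx2 = x2 @ F                                     # epipolar lines in image 1
    num = np.abs(np.sum(x2 * Fx1, axis=1))            # |x2^T F x1|
    den = np.sqrt(Fx1[:, 0]**2 + Fx1[:, 1]**2 + Ftx2[:, 0]**2 + Ftx2[:, 1]**2)
    return num / np.maximum(den, 1e-12)

def keep_consistent(F, pts1, pts2, thresh=1.0):
    """Boolean mask of matches that satisfy the epipolar constraint."""
    return epipolar_errors(F, pts1, pts2) < thresh

# toy F for ideal rectified stereo: matches must lie on the same scanline
F = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, -1.0], [0.0, 1.0, 0.0]])
print(keep_consistent(F, [[10, 20], [30, 40]], [[15, 20], [33, 47]]))  # [ True False]
```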
To this end, we collect a custom CPR video dataset in\nwhich trainees independently perform resuscitation on mannequins\nin adherence to approved guidelines, and we devise an auxiliary toolbox to\nassist in supervising and rectifying potential issues via\nmodern deep learning methodologies. Our research empirically views this problem\nas a temporal action segmentation (TAS) task in computer vision, which aims to\nsegment an untrimmed video at a frame-wise level. Here, we propose a\nPrompt-enhanced hierarchical Transformer (PhiTrans) that integrates three\nindispensable modules, including a textual prompt-based Video Features\nExtractor (VFE), a transformer-based Action Segmentation Executor (ASE), and a\nregression-based Prediction Refinement Calibrator (PRC). The\nmodel backbone derives from applications on three established public TAS\ndatasets (GTEA, 50Salads, and Breakfast), which grounds the development of the\nsegmentation pipeline on the CPR dataset. Overall,\nwe are the first to probe into a feasible pipeline that genuinely\nelevates the quality of CPR instruction via action segmentation in\nconjunction with cutting-edge deep learning techniques. Associated experiments\nsupport our implementation, with multiple metrics surpassing 91.0%.\n","authors":["Yang Liu","Xiaoyun Zhong","Shiyao Zhai","Zhicheng Du","Zhenyuan Gao","Qiming Huang","Canyang Zhang","Bin Jiang","Vijay Kumar Pandey","Sanyang Han","Runming Wang","Yuxing Han","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2308.16552v1.pdf","comment":"Transformer for Cardiopulmonary Resuscitation"},{"id":"http://arxiv.org/abs/2308.16551v1","updated":"2023-08-31T08:43:21Z","published":"2023-08-31T08:43:21Z","title":"Object Detection for Caries or Pit and Fissure Sealing Requirement in\n Children's First Permanent Molars","summary":" Dental caries is one of the most common oral diseases that, if left\nuntreated, can lead to a variety of oral problems. It mainly occurs inside the\npits and fissures on the occlusal/buccal/palatal surfaces of molars, and\nchildren are a high-risk group for pit and fissure caries in permanent molars.\nPit and fissure sealing is one of the most effective methods that is widely\nused in the prevention of pit and fissure caries. However, current detection of\npits and fissures or caries depends primarily on experienced dentists, whose\nexpertise ordinary parents do not have, and children may miss remedial\ntreatment without timely detection. To address this issue, we present a method\nto autodetect caries and pit and fissure sealing requirements using oral photos\ntaken by smartphones. We use the YOLOv5 and YOLOX models and adopt a tiling\nstrategy to reduce information loss during image pre-processing. The best\nresult for the YOLOXs model with the tiling strategy is 72.3 mAP@0.5, while the best\nresult without tiling is 71.2. The YOLOv5s6 model with/without tiling\nattains 70.9/67.9 mAP@0.5, respectively. 
We deploy the pre-trained network to\nmobile devices as a WeChat applet, allowing in-home detection by parents or\nchildren's guardians.\n","authors":["Chenyao Jiang","Shiyao Zhai","Hengrui Song","Yuqing Ma","Yachen Fan","Yancheng Fang","Dongmei Yu","Canyang Zhang","Sanyang Han","Runming Wang","Yong Liu","Jianbo Li","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2308.16551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05102v2","updated":"2023-08-31T08:43:17Z","published":"2023-03-09T08:21:50Z","title":"StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent\n Disentangled Space","summary":" One major challenge in machine learning applications is coping with\nmismatches between the datasets used in development and those obtained in\nreal-world applications. These mismatches may lead to inaccurate predictions\nand errors, resulting in poor product quality and unreliable systems. In this\nstudy, we propose StyleDiff to inform developers of the differences between the\ntwo datasets for the steady development of machine learning systems. Using\ndisentangled image spaces obtained from recently proposed generative models,\nStyleDiff compares the two datasets by focusing on attributes in the images and\nprovides an easy-to-understand analysis of the differences between the\ndatasets. The proposed StyleDiff runs in $O(dN\log N)$, where $N$ is the\nsize of the datasets and $d$ is the number of attributes, enabling its\napplication to large datasets. We demonstrate that StyleDiff accurately detects\ndifferences between datasets and presents them in an understandable format\nusing, for example, driving scenes datasets.\n","authors":["Keisuke Kawano","Takuro Kutsuna","Ryoko Tokuhisa","Akihiro Nakamura","Yasushi Esaki"],"pdf_url":"https://arxiv.org/pdf/2303.05102v2.pdf","comment":"25 pages, 17 figures, Image and Vision Computing"},{"id":"http://arxiv.org/abs/2308.16532v1","updated":"2023-08-31T08:21:29Z","published":"2023-08-31T08:21:29Z","title":"Decoupled Local Aggregation for Point Cloud Learning","summary":" The unstructured nature of point clouds demands that local aggregation be\nadaptive to different local structures. Previous methods meet this by\nexplicitly embedding spatial relations into each aggregation process. Although\nthis coupled approach has been shown effective in generating clear semantics,\naggregation can be greatly slowed down due to repeated relation learning and\nredundant computation to mix directional and point features. In this work, we\npropose to decouple the explicit modelling of spatial relations from local\naggregation. We theoretically prove that basic neighbor pooling operations can\nalso function without loss of clarity in feature fusion, so long as essential\nspatial information has been encoded in point features. As an instantiation of\ndecoupled local aggregation, we present DeLA, a lightweight point network,\nwhere in each learning stage relative spatial encodings are first formed, and\nlocal aggregation is then performed using only pointwise convolutions plus edge\nmax-pooling. Further, a regularization term is employed to reduce\npotential ambiguity through the prediction of relative coordinates.\nThough conceptually simple, experimental results on five classic benchmarks\ndemonstrate that DeLA achieves state-of-the-art performance with reduced or\ncomparable latency. Specifically, DeLA achieves over 90\\% overall accuracy on\nScanObjectNN and 74\\% mIoU on S3DIS Area 5. 
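The StyleDiff abstract above reports an $O(dN\log N)$ cost, which is what one gets by sorting each of the $d$ attributes once. The sorted-difference score below is an illustrative stand-in for the paper's actual comparison (all names and the toy data are mine):

```python
import numpy as np

def attribute_shift(a, b):
    """Mean absolute difference between sorted attribute values (O(N log N))."""
    return float(np.mean(np.abs(np.sort(a) - np.sort(b))))

def dataset_diff(A, B):
    """A, B: (N, d) attribute matrices; returns one shift score per attribute."""
    return [attribute_shift(A[:, j], B[:, j]) for j in range(A.shape[1])]

rng = np.random.default_rng(0)
A = rng.normal(0.0, 1.0, size=(1000, 3))
B = rng.normal(0.3, 1.0, size=(1000, 3))          # shifted in every attribute
print([round(s, 2) for s in dataset_diff(A, B)])  # roughly [0.3, 0.3, 0.3]
```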
Our code is available at\nhttps://github.com/Matrix-ASC/DeLA.\n","authors":["Binjie Chen","Yunzhou Xia","Yu Zang","Cheng Wang","Jonathan Li"],"pdf_url":"https://arxiv.org/pdf/2308.16532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16530v1","updated":"2023-08-31T08:21:09Z","published":"2023-08-31T08:21:09Z","title":"Privacy-Preserving Medical Image Classification through Deep Learning\n and Matrix Decomposition","summary":" Deep learning (DL)-based solutions have been extensively researched in the\nmedical domain in recent years, enhancing the efficacy of diagnosis, planning,\nand treatment. Since the usage of health-related data is strictly regulated,\nprocessing medical records outside the hospital environment for developing and\nusing DL models demands robust data protection measures. At the same time, it\ncan be challenging to guarantee that a DL solution delivers a minimum level of\nperformance when being trained on secured data, without being specifically\ndesigned for the given task. Our approach uses singular value decomposition\n(SVD) and principal component analysis (PCA) to obfuscate the medical images\nbefore employing them in the DL analysis. The capability of DL algorithms to\nextract relevant information from secured data is assessed on a task of\nangiographic view classification based on obfuscated frames. The security level\nis probed by simulated artificial intelligence (AI)-based reconstruction\nattacks, considering two threat actors with different prior knowledge of the\ntargeted data. The degree of privacy is quantitatively measured using\nsimilarity indices. Although a trade-off between privacy and accuracy should be\nconsidered, the proposed technique allows for training the angiographic view\nclassifier exclusively on secured data with satisfactory performance and with\nno computational overhead, model adaptation, or hyperparameter tuning. While\nthe obfuscated medical image content is well protected against human\nperception, the hypothetical reconstruction attack proved that it is also\ndifficult to recover the complete information of the original frames.\n","authors":["Andreea Bianca Popescu","Cosmin Ioan Nita","Ioana Antonia Taca","Anamaria Vizitiu","Lucian Mihai Itu"],"pdf_url":"https://arxiv.org/pdf/2308.16530v1.pdf","comment":"6 pages, 9 figures, Published in: 2023 31st Mediterranean Conference\n on Control and Automation (MED)"},{"id":"http://arxiv.org/abs/2308.16528v1","updated":"2023-08-31T08:19:26Z","published":"2023-08-31T08:19:26Z","title":"SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded\n Objects","summary":" To enable meaningful robotic manipulation of objects in the real world, 6D\npose estimation is one of the critical aspects. Most existing approaches have\ndifficulty extending predictions to scenarios where novel object instances\nare continuously introduced, especially with heavy occlusions. In this work, we\npropose a few-shot pose estimation (FSPE) approach called SA6D, which uses a\nself-adaptive segmentation module to identify the novel target object and\nconstruct a point cloud model of the target object using only a small number of\ncluttered reference images. Unlike existing methods, SA6D does not require\nobject-centric reference images or any additional object information, making it\na more generalizable and scalable solution across categories. 
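The privacy abstract above obfuscates images with SVD and PCA before training. The abstract does not give the exact recipe, so the rank truncation below is only a sketch of the general idea (keeping k singular components and discarding the rest); the value of k and the function names are assumptions:

```python
import numpy as np

def svd_obfuscate(img, k=8):
    """Rank-k SVD approximation of a 2D image; detail beyond rank k is discarded."""
    U, s, Vt = np.linalg.svd(img.astype(np.float64), full_matrices=False)
    return (U[:, :k] * s[:k]) @ Vt[:k, :]

frame = np.random.rand(128, 128)            # stand-in for an angiographic frame
protected = svd_obfuscate(frame, k=8)
print(np.linalg.matrix_rank(protected))     # 8
```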
We evaluate SA6D\non real-world tabletop object datasets and demonstrate that SA6D outperforms\nexisting FSPE methods, particularly in cluttered scenes with occlusions, while\nrequiring fewer reference images.\n","authors":["Ning Gao","Ngo Anh Vien","Hanna Ziesche","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08566v2","updated":"2023-08-31T08:17:57Z","published":"2023-03-15T12:34:24Z","title":"Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning","summary":" Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful\nalternative to full fine-tuning for adapting pre-trained vision models to\ndownstream tasks, tuning only a small number of parameters while freezing\nthe vast majority to ease the storage burden and optimization difficulty.\nHowever, existing PEFT methods introduce trainable parameters at the same\npositions across different tasks, relying solely on human heuristics and\nneglecting the domain gaps. To this end, we study where to introduce and how to\nallocate trainable parameters by proposing a novel Sensitivity-aware visual\nParameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates\ntrainable parameters to task-specific important positions given a desired\ntunable parameter budget. Specifically, our SPT first quickly identifies the\nsensitive parameters that require tuning for a given task in a data-dependent\nway. Next, our SPT further boosts the representational capability for the\nweight matrices whose number of sensitive parameters exceeds a pre-defined\nthreshold by utilizing existing structured tuning methods, e.g., LoRA [23] or\nAdapter [22], to replace directly tuning the selected sensitive parameters\n(unstructured tuning) under the budget. Extensive experiments on a wide range\nof downstream recognition tasks show that our SPT is complementary to the\nexisting PEFT methods and largely boosts their performance, e.g., SPT improves\nAdapter with supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean\nTop-1 accuracy, reaching SOTA performance on FGVC and VTAB-1k benchmarks,\nrespectively. Source code is at https://github.com/ziplab/SPT\n","authors":["Haoyu He","Jianfei Cai","Jing Zhang","Dacheng Tao","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2303.08566v2.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2308.16527v1","updated":"2023-08-31T08:17:29Z","published":"2023-08-31T08:17:29Z","title":"Unsupervised Recognition of Unknown Objects for Open-World Object\n Detection","summary":" Open-World Object Detection (OWOD) extends the object detection problem to a\nrealistic and dynamic scenario, where a detection model is required to be\ncapable of detecting both known and unknown objects and incrementally learning\nnewly introduced knowledge. Current OWOD models, such as ORE and OW-DETR, focus\non pseudo-labeling regions with high objectness scores as unknowns, whose\nperformance relies heavily on the supervision of known objects. While they can\ndetect the unknowns that exhibit similar features to the known objects, they\nsuffer from a severe label bias problem: they tend to detect all regions\n(including unknown object regions) that are dissimilar to the known objects as\npart of the background. To eliminate the label bias, this paper proposes a\nnovel approach that learns an unsupervised discriminative model to recognize\ntrue unknown objects from raw pseudo labels generated by unsupervised region\nproposal methods. 
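The SPT abstract above first identifies sensitive parameters in a data-dependent way before spending the tuning budget. A minimal sketch of one such criterion (the |gradient x weight| saliency and every name below are my assumptions; the paper's exact criterion may differ):

```python
import numpy as np

def sensitive_mask(weights, grads, budget_ratio=0.01):
    """Boolean mask selecting the most sensitive `budget_ratio` of parameters."""
    saliency = np.abs(weights * grads)        # assumed first-order saliency
    k = max(1, int(budget_ratio * saliency.size))
    thresh = np.partition(saliency.ravel(), -k)[-k]  # k-th largest saliency
    return saliency >= thresh

w = np.random.randn(256, 256)
g = np.random.randn(256, 256)                 # stand-in for task gradients
mask = sensitive_mask(w, g, budget_ratio=0.01)
print(mask.sum(), "parameters selected for tuning")
```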
The resulting model can be further refined by a\nclassification-free self-training method which iteratively extends pseudo\nunknown objects to the unlabeled regions. Experimental results show that our\nmethod 1) significantly outperforms the prior SOTA in detecting unknown objects\nwhile maintaining competitive performance of detecting known object classes on\nthe MS COCO dataset, and 2) achieves better generalization ability on the LVIS\nand Objects365 datasets.\n","authors":["Ruohuan Fang","Guansong Pang","Lei Zhou","Xiao Bai","Jin Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.16527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15816v2","updated":"2023-08-31T08:14:17Z","published":"2023-08-30T07:41:26Z","title":"Improving Underwater Visual Tracking With a Large Scale Dataset and\n Image Enhancement","summary":" This paper presents a new dataset and general tracker enhancement method for\nUnderwater Visual Object Tracking (UVOT). Despite its significance, underwater\ntracking has remained unexplored due to data inaccessibility. It poses distinct\nchallenges; the underwater environment exhibits non-uniform lighting\nconditions, low visibility, lack of sharpness, low contrast, camouflage, and\nreflections from suspended particles. Performance of traditional tracking\nmethods designed primarily for terrestrial or open-air scenarios drops in such\nconditions. We address the problem by proposing a novel underwater image\nenhancement algorithm designed specifically to boost tracking quality. The\nmethod has resulted in a significant performance improvement, of up to 5.0%\nAUC, of state-of-the-art (SOTA) visual trackers. To develop robust and accurate\nUVOT methods, large-scale datasets are required. To this end, we introduce a\nlarge-scale UVOT benchmark dataset consisting of 400 video segments and 275,000\nmanually annotated frames enabling underwater training and evaluation of deep\ntrackers. The videos are labelled with several underwater-specific tracking\nattributes including watercolor variation, target distractors, camouflage,\ntarget relative size, and low visibility conditions. The UVOT400 dataset,\ntracking results, and the code are publicly available on:\nhttps://github.com/BasitAlawode/UWVOT400.\n","authors":["Basit Alawode","Fayaz Ali Dharejo","Mehnaz Ummar","Yuhang Guo","Arif Mahmood","Naoufel Werghi","Fahad Shahbaz Khan","Jiri Matas","Sajid Javed"],"pdf_url":"https://arxiv.org/pdf/2308.15816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16518v1","updated":"2023-08-31T08:03:25Z","published":"2023-08-31T08:03:25Z","title":"MS23D: A 3D Object Detection Method Using Multi-Scale Semantic Feature\n Points to Construct 3D Feature Layers","summary":" Lidar point clouds, as a type of data with accurate distance perception, can\neffectively represent the motion and posture of objects in three-dimensional\nspace. However, the sparsity and disorderliness of point clouds make it\nchallenging to extract features directly from them. Many studies have addressed\nthis issue by transforming point clouds into regular voxel representations.\nHowever, these methods often lead to the loss of fine-grained local feature\ninformation due to downsampling. Moreover, the sparsity of point clouds poses\ndifficulties in efficiently aggregating features in 3D feature layers using\nvoxel-based two-stage methods. To address these issues, this paper proposes a\ntwo-stage 3D detection framework called MS$^{2}$3D. 
In MS$^{2}$3D, we utilize\nsmall-sized voxels to extract fine-grained local features and large-sized\nvoxels to capture long-range local features. Additionally, we propose a method\nfor constructing 3D feature layers using multi-scale semantic feature points,\nenabling the transformation of sparse 3D feature layers into more compact\nrepresentations. Furthermore, we compute the offset between feature points in\nthe 3D feature layers and the centroid of objects, aiming to bring them as\nclose as possible to the object's center. It significantly enhances the\nefficiency of feature aggregation. To validate the effectiveness of our method,\nwe evaluated it on both the KITTI and ONCE datasets.\n","authors":["Yongxin Shao","Aihong Tan","Tianhong Yan","Zhetao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.16518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06791v2","updated":"2023-08-31T07:49:41Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Projection and Voxel-based Double Branch Single-Stage 3D\n Object Detector","summary":" LiDAR-based 3D object detection and classification is crucial for autonomous\ndriving. However, real-time inference from extremely sparse 3D data poses a\nformidable challenge. To address this issue, a common approach is to project\npoint clouds onto a bird's-eye or perspective view, effectively converting them\ninto an image-like data format. However, this excessive compression of point\ncloud data often leads to the loss of information. This paper proposes a 3D\nobject detector based on voxel and projection double branch feature extraction\n(PV-SSD) to address the problem of information loss. We add a voxel feature\ninput containing rich local semantic information, which is fully fused with the\nprojected features in the feature extraction stage to reduce the local\ninformation loss caused by projection. Good performance is achieved compared\nto previous work. In addition, this paper makes the following\ncontributions: 1) a voxel feature extraction method with variable receptive\nfields is proposed; 2) a feature point sampling method by weight sampling is\nused to select the feature points that are more conducive to the detection\ntask; 3) the MSSFA module is proposed based on the SSFA module. To verify the\neffectiveness of our method, we designed comparison experiments.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan"],"pdf_url":"https://arxiv.org/pdf/2308.06791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16512v1","updated":"2023-08-31T07:49:06Z","published":"2023-08-31T07:49:06Z","title":"MVDream: Multi-view Diffusion for 3D Generation","summary":" We propose MVDream, a multi-view diffusion model that is able to generate\ngeometrically consistent multi-view images from a given text prompt. By\nleveraging image diffusion models pre-trained on large-scale web datasets and a\nmulti-view dataset rendered from 3D assets, the resulting multi-view diffusion\nmodel can achieve both the generalizability of 2D diffusion and the consistency\nof 3D data. Such a model can thus be applied as a multi-view prior for 3D\ngeneration via Score Distillation Sampling, where it greatly improves the\nstability of existing 2D-lifting methods by solving the 3D consistency problem.\nFinally, we show that the multi-view diffusion model can also be fine-tuned\nunder a few-shot setting for personalized 3D generation, i.e. 
DreamBooth3D\napplication, where the consistency can be maintained after learning the subject\nidentity.\n","authors":["Yichun Shi","Peng Wang","Jianglong Ye","Mai Long","Kejie Li","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16512v1.pdf","comment":"Our project page is https://MV-Dream.github.io"},{"id":"http://arxiv.org/abs/2308.16510v1","updated":"2023-08-31T07:47:11Z","published":"2023-08-31T07:47:11Z","title":"Robust GAN inversion","summary":" Recent advancements in real image editing have been attributed to the\nexploration of Generative Adversarial Networks (GANs) latent space. However,\nthe main challenge of this procedure is GAN inversion, which aims to map the\nimage to the latent space accurately. Existing methods that work on extended\nlatent space $W+$ are unable to achieve low distortion and high editability\nsimultaneously. To address this issue, we propose an approach which works in\nnative latent space $W$ and tunes the generator network to restore missing\nimage details. We introduce a novel regularization strategy with learnable\ncoefficients obtained by training randomized StyleGAN 2 model - WRanGAN. This\nmethod outperforms traditional approaches in terms of reconstruction quality\nand computational efficiency, achieving the lowest distortion with 4 times\nfewer parameters. Furthermore, we observe a slight improvement in the quality\nof constructing hyperplanes corresponding to binary image attributes. We\ndemonstrate the effectiveness of our approach on two complex datasets:\nFlickr-Faces-HQ and LSUN Church.\n","authors":["Egor Sevriugov","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2308.16510v1.pdf","comment":"22 pages, 28 figures"},{"id":"http://arxiv.org/abs/2308.09331v2","updated":"2023-08-31T07:45:59Z","published":"2023-08-18T06:26:22Z","title":"SAMedOCT: Adapting Segment Anything Model (SAM) for Retinal OCT","summary":" The Segment Anything Model (SAM) has gained significant attention in the\nfield of image segmentation due to its impressive capabilities and prompt-based\ninterface. While SAM has already been extensively evaluated in various domains,\nits adaptation to retinal OCT scans remains unexplored. To bridge this research\ngap, we conduct a comprehensive evaluation of SAM and its adaptations on a\nlarge-scale public dataset of OCTs from RETOUCH challenge. Our evaluation\ncovers diverse retinal diseases, fluid compartments, and device vendors,\ncomparing SAM against state-of-the-art retinal fluid segmentation methods.\nThrough our analysis, we showcase adapted SAM's efficacy as a powerful\nsegmentation model in retinal OCT scans, although still lagging behind\nestablished methods in some circumstances. The findings highlight SAM's\nadaptability and robustness, showcasing its utility as a valuable tool in\nretinal OCT image analysis and paving the way for further advancements in this\ndomain.\n","authors":["Botond Fazekas","José Morano","Dmitrii Lachinov","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2308.09331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10161v2","updated":"2023-08-31T07:38:50Z","published":"2023-08-20T04:34:30Z","title":"ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under\n Challenging Conditions","summary":" Robust 3D object detection in extreme weather and illumination conditions is\na challenging task. 
While radars and thermal cameras are known for their\nresilience to these conditions, few studies have been conducted on\nradar-thermal fusion due to the lack of corresponding datasets. To address this\ngap, we first present a new multi-modal dataset called ThermRad, which includes\na 3D LiDAR, a 4D radar, an RGB camera and a thermal camera. This dataset is\nunique because it includes data from all four sensors in extreme weather\nconditions, providing a valuable resource for future research in this area. To\nvalidate the robustness of 4D radars and thermal cameras for 3D object\ndetection in challenging weather conditions, we propose a new multi-modal\nfusion method called RTDF-RCNN, which leverages the complementary strengths of\n4D radars and thermal cameras to boost object detection performance. To further\nprove the effectiveness of our proposed framework, we re-implement\nstate-of-the-art (SOTA) 3D detectors on our dataset as benchmarks for\nevaluation. Our method achieves significant enhancements in detecting cars,\npedestrians, and cyclists, with improvements of over 7.98%, 24.27%, and 27.15%,\nrespectively, while achieving comparable results to LiDAR-based approaches. Our\ncontributions in both the ThermRad dataset and the new multi-modal fusion\nmethod provide a new approach to robust 3D object detection in adverse weather\nand illumination conditions. The ThermRad dataset will be released.\n","authors":["Qiao Yan","Yihan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10161v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.06681v3","updated":"2023-08-31T07:26:59Z","published":"2023-03-12T14:54:22Z","title":"Learning Deep Intensity Field for Extremely Sparse-View CBCT\n Reconstruction","summary":" Sparse-view cone-beam CT (CBCT) reconstruction is an important direction to\nreduce radiation dose and benefit clinical applications. Previous voxel-based\ngeneration methods represent the CT as discrete voxels, resulting in high\nmemory requirements and limited spatial resolution due to the use of 3D\ndecoders. In this paper, we formulate the CT volume as a continuous intensity\nfield and develop a novel DIF-Net to perform high-quality CBCT reconstruction\nfrom extremely sparse (fewer than 10) projection views at an ultrafast speed.\nThe intensity field of a CT can be regarded as a continuous function of 3D\nspatial points. Therefore, the reconstruction can be reformulated as regressing\nthe intensity value of an arbitrary 3D point from given sparse projections.\nSpecifically, for a point, DIF-Net extracts its view-specific features from\ndifferent 2D projection views. These features are subsequently aggregated by a\nfusion module for intensity estimation. Notably, thousands of points can be\nprocessed in parallel to improve efficiency during training and testing. In\npractice, we collect a knee CBCT dataset to train and evaluate DIF-Net.\nExtensive experiments show that our approach can reconstruct CBCT with high\nimage quality and high spatial resolution from extremely sparse views within\n1.6 seconds, significantly outperforming state-of-the-art methods. 
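The DIF-Net abstract above regresses the intensity of an arbitrary 3D point from features sampled at its projections in the sparse views. A bare-bones sketch of that lookup (the pinhole projection, bilinear sampling, and mean fusion are simplifying assumptions; all names are mine, not the paper's code):

```python
import numpy as np

def bilinear(feat, x, y):
    """Sample a (H, W) feature map at continuous coordinates (x, y)."""
    x0, y0 = int(np.floor(x)), int(np.floor(y))
    dx, dy = x - x0, y - y0
    f = feat[y0:y0 + 2, x0:x0 + 2]
    return (f[0, 0] * (1 - dx) * (1 - dy) + f[0, 1] * dx * (1 - dy)
            + f[1, 0] * (1 - dx) * dy + f[1, 1] * dx * dy)

def point_intensity(point, proj_mats, view_feats, regress):
    """Project `point` into every view, sample features, fuse, and regress."""
    samples = []
    for P, feat in zip(proj_mats, view_feats):
        u = P @ np.append(point, 1.0)         # 3x4 projection, homogeneous
        x, y = u[0] / u[2], u[1] / u[2]
        samples.append(bilinear(feat, x, y))
    return regress(np.mean(samples))          # stand-in for the fusion module

views = [np.random.rand(64, 64) for _ in range(4)]
Ps = [np.array([[1.0, 0, 0, 32], [0, 1.0, 0, 32], [0, 0, 0, 1.0]])] * 4
print(point_intensity(np.zeros(3), Ps, views, regress=lambda v: v))
```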
Our code\nwill be available at https://github.com/xmed-lab/DIF-Net.\n","authors":["Yiqun Lin","Zhongjin Luo","Wei Zhao","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2303.06681v3.pdf","comment":"MICCAI'23"},{"id":"http://arxiv.org/abs/2308.16139v2","updated":"2023-08-31T07:26:50Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Arcot Sowmya","Eduardo A. 
Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.10280v2","updated":"2023-08-31T07:23:56Z","published":"2023-08-20T14:27:28Z","title":"MacFormer: Map-Agent Coupled Transformer for Real-time and Robust\n Trajectory Prediction","summary":" Predicting the future behavior of agents is a fundamental task in autonomous\nvehicle domains. Accurate prediction relies on comprehending the surrounding\nmap, which significantly regularizes agent behaviors. However, existing methods\nhave limitations in exploiting the map and exhibit a strong dependence on\nhistorical trajectories, which yield unsatisfactory prediction performance and\nrobustness. Additionally, their heavy network architectures impede real-time\napplications. To tackle these problems, we propose Map-Agent Coupled\nTransformer (MacFormer) for real-time and robust trajectory prediction. Our\nframework explicitly incorporates map constraints into the network via two\ncarefully designed modules named coupled map and reference extractor. A novel\nmulti-task optimization strategy (MTOS) is presented to enhance learning of\ntopology and rule constraints. We also devise bilateral query scheme in context\nfusion for a more efficient and lightweight network. We evaluated our approach\non Argoverse 1, Argoverse 2, and nuScenes real-world benchmarks, where it all\nachieved state-of-the-art performance with the lowest inference latency and\nsmallest model size. Experiments also demonstrate that our framework is\nresilient to imperfect tracklet inputs. Furthermore, we show that by combining\nwith our proposed strategies, classical models outperform their baselines,\nfurther validating the versatility of our framework.\n","authors":["Chen Feng","Hangning Zhou","Huadong Lin","Zhigang Zhang","Ziyao Xu","Chi Zhang","Boyu Zhou","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2308.10280v2.pdf","comment":"Accepted by IEEE Robotics and Automation Letters. 8 Pages, 9 Figures,\n 9 Tables. Video: https://www.youtube.com/watch?v=XY388iI6sPQ"},{"id":"http://arxiv.org/abs/2305.15777v2","updated":"2023-08-31T07:20:34Z","published":"2023-05-25T06:44:43Z","title":"Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation","summary":" Medical image data are often limited due to the expensive acquisition and\nannotation process. 
Hence, training a deep-learning model with only raw data\ncan easily lead to overfitting. One solution to this problem is to augment the\nraw data with various transformations, improving the model's ability to\ngeneralize to new data. However, manually configuring a generic augmentation\ncombination and parameters for different datasets is non-trivial due to\ninconsistent acquisition approaches and data distributions. Therefore,\nautomatic data augmentation is proposed to learn favorable augmentation\nstrategies for different datasets, but it incurs large GPU overhead. To this\nend, we present a novel method, called Dynamic Data Augmentation (DDAug), which\nis efficient and has negligible computation cost. Our DDAug develops a\nhierarchical tree structure to represent various augmentations and utilizes an\nefficient Monte Carlo tree search algorithm to update, prune, and sample the\ntree. As a result, the augmentation pipeline can be optimized for each dataset\nautomatically. Experiments on multiple Prostate MRI datasets show that our\nmethod outperforms the current state-of-the-art data augmentation strategies.\n","authors":["Xinyue Xu","Yuhan Hsi","Haonan Wang","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2305.15777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16490v1","updated":"2023-08-31T06:52:43Z","published":"2023-08-31T06:52:43Z","title":"Latent Painter","summary":" Latent diffusers revolutionized generative AI and inspired creative art.\nWhen denoising the latent, the predicted original images at each step\ncollectively animate the formation. However, the animation is limited by the\ndenoising nature of the diffuser, and only renders a sharpening process. This\nwork presents Latent Painter, which uses the latent as the canvas, and the\ndiffuser predictions as the plan, to generate painting animation. Latent\nPainter also transitions one generated image into another, which can happen between\nimages from two different sets of checkpoints.\n","authors":["Shih-Chieh Su"],"pdf_url":"https://arxiv.org/pdf/2308.16490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16486v1","updated":"2023-08-31T06:45:56Z","published":"2023-08-31T06:45:56Z","title":"Illumination Distillation Framework for Nighttime Person\n Re-Identification and A New Benchmark","summary":" Nighttime person Re-ID (person re-identification in the nighttime) is a very\nimportant and challenging task for visual surveillance, but it has not been\nthoroughly investigated. Under low illumination conditions, the performance\nof person Re-ID methods usually sharply deteriorates. To address the low\nillumination challenge in nighttime person Re-ID, this paper proposes an\nIllumination Distillation Framework (IDF), which utilizes illumination\nenhancement and illumination distillation schemes to promote the learning of\nRe-ID models. Specifically, IDF consists of a master branch, an illumination\nenhancement branch, and an illumination distillation module. The master branch\nis used to extract the features from a nighttime image. The illumination\nenhancement branch first estimates an enhanced image from the nighttime image\nusing a nonlinear curve mapping method and then extracts the enhanced features.\nHowever, nighttime and enhanced features usually contain data noise due to\nunstable lighting conditions and enhancement failures. To fully exploit the\ncomplementary benefits of nighttime and enhanced features while suppressing\ndata noise, we propose an illumination distillation module. 
In particular, the\nillumination distillation module fuses the features from two branches through a\nbottleneck fusion model and then uses the fused features to guide the learning\nof both branches in a distillation manner. In addition, we build a real-world\nnighttime person Re-ID dataset, named Night600, which contains 600 identities\ncaptured from different viewpoints and nighttime illumination conditions under\ncomplex outdoor environments. Experimental results demonstrate that our IDF can\nachieve state-of-the-art performance on two nighttime person Re-ID datasets\n(i.e., Night600 and Knight). We will release our code and dataset at\nhttps://github.com/Alexadlu/IDF.\n","authors":["Andong Lu","Zhang Zhang","Yan Huang","Yifan Zhang","Chenglong Li","Jin Tang","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16486v1.pdf","comment":"Accepted by TMM"},{"id":"http://arxiv.org/abs/2308.16484v1","updated":"2023-08-31T06:44:59Z","published":"2023-08-31T06:44:59Z","title":"Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning","summary":" Affordable 3D scanners often produce sparse and non-uniform point clouds that\nnegatively impact downstream applications in robotic systems. While existing\npoint cloud upsampling architectures have demonstrated promising results on\nstandard benchmarks, they tend to experience significant performance drops when\nthe test data have different distributions from the training data. To address\nthis issue, this paper proposes a test-time adaptation approach to enhance model\ngenerality of point cloud upsampling. The proposed approach leverages\nmeta-learning to explicitly learn network parameters for test-time adaptation.\nOur method does not require any prior information about the test data. During\nmeta-training, the model parameters are learned from a collection of\ninstance-level tasks, each of which consists of a sparse-dense pair of point\nclouds from the training data. During meta-testing, the trained model is\nfine-tuned with a few gradient updates to produce a unique set of network\nparameters for each test instance. The updated model is then used for the final\nprediction. Our framework is generic and can be applied in a plug-and-play\nmanner with existing backbone networks in point cloud upsampling. Extensive\nexperiments demonstrate that our approach improves the performance of\nstate-of-the-art models.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16481v1","updated":"2023-08-31T06:32:11Z","published":"2023-08-31T06:32:11Z","title":"Point-TTA: Test-Time Adaptation for Point Cloud Registration Using\n Multitask Meta-Auxiliary Learning","summary":" We present Point-TTA, a novel test-time adaptation framework for point cloud\nregistration (PCR) that improves the generalization and the performance of\nregistration models. While learning-based approaches have achieved impressive\nprogress, generalization to unknown testing environments remains a major\nchallenge due to the variations in 3D scans. Existing methods typically train a\ngeneric model and the same trained model is applied on each instance during\ntesting. This could be sub-optimal since it is difficult for the same model to\nhandle all the variations during testing. In this paper, we propose a test-time\nadaptation approach for PCR. Our model can adapt to unseen distributions at\ntest-time without requiring any prior knowledge of the test data.
Concretely,\nwe design three self-supervised auxiliary tasks that are optimized jointly with\nthe primary PCR task. Given a test instance, we adapt our model using these\nauxiliary tasks and the updated model is used to perform the inference. During\ntraining, our model is trained using a meta-auxiliary learning approach, such\nthat the adapted model via auxiliary tasks improves the accuracy of the primary\ntask. Experimental results demonstrate the effectiveness of our approach in\nimproving generalization of point cloud registration and outperforming other\nstate-of-the-art approaches.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15142v2","updated":"2023-08-31T05:48:06Z","published":"2023-06-27T02:03:46Z","title":"LRANet: Towards Accurate and Efficient Scene Text Detection with\n Low-Rank Approximation Network","summary":" Recently, regression-based methods, which predict parameterized text shapes\nfor text localization, have gained popularity in scene text detection. However,\nthe existing parameterized text shape methods still have limitations in\nmodeling arbitrary-shaped texts due to ignoring the utilization of\ntext-specific shape information. Moreover, the time consumption of the entire\npipeline has been largely overlooked, leading to a suboptimal overall inference\nspeed. To address these issues, we first propose a novel parameterized text\nshape method based on low-rank approximation. Unlike other shape representation\nmethods that employ data-irrelevant parameterization, our approach utilizes\nsingular value decomposition and reconstructs the text shape using a few\neigenvectors learned from labeled text contours. By exploring the shape\ncorrelation among different text contours, our method achieves consistency,\ncompactness, simplicity, and robustness in shape representation. Next, we\npropose a dual assignment scheme for speed acceleration. It adopts a sparse\nassignment branch to accelerate the inference speed, and meanwhile, provides\nample supervised signals for training through a dense assignment branch.\nBuilding upon these designs, we implement an accurate and efficient\narbitrary-shaped text detector named LRANet. Extensive experiments are\nconducted on several challenging benchmarks, demonstrating the superior\naccuracy and efficiency of LRANet compared to state-of-the-art methods. Code\nwill be released soon.\n","authors":["Yuchen Su","Zhineng Chen","Zhiwen Shao","Yuning Du","Zhilong Ji","Jinfeng Bai","Yong Zhou","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2306.15142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16477v1","updated":"2023-08-31T05:43:46Z","published":"2023-08-31T05:43:46Z","title":"PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction","summary":" Vectorized high-definition map online construction has garnered considerable\nattention in the field of autonomous driving research. Most existing approaches\nmodel changeable map elements using a fixed number of points, or predict local\nmaps in a two-stage autoregressive manner, which may miss essential details and\nlead to error accumulation. Towards precise map element learning, we propose a\nsimple yet effective architecture named PivotNet, which adopts unified\npivot-based map representations and is formulated as a direct set prediction\nparadigm. 
Concretely, we first propose a novel Point-to-Line Mask module to\nencode both the subordinate and geometrical point-line priors in the network.\nThen, a well-designed Pivot Dynamic Matching module is proposed to model the\ntopology in dynamic point sequences by introducing the concept of sequence\nmatching. Furthermore, to supervise the position and topology of the vectorized\npoint predictions, we propose a Dynamic Vectorized Sequence loss. Extensive\nexperiments and ablations show that PivotNet is remarkably superior to other\nSOTAs by at least 5.9 mAP. The code will be available soon.\n","authors":["Wenjie Ding","Limeng Qiao","Xi Qiu","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16477v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.16466v1","updated":"2023-08-31T05:20:48Z","published":"2023-08-31T05:20:48Z","title":"Self-Sampling Meta SAM: Enhancing Few-shot Medical Image Segmentation\n with Meta-Learning","summary":" While the Segment Anything Model (SAM) excels in semantic segmentation for\ngeneral-purpose images, its performance significantly deteriorates when applied\nto medical images, primarily attributable to insufficient representation of\nmedical images in its training dataset. Nonetheless, gathering comprehensive\ndatasets and training models that are universally applicable is particularly\nchallenging due to the long-tail problem common in medical images. To address\nthis gap, here we present a Self-Sampling Meta SAM (SSM-SAM) framework for\nfew-shot medical image segmentation. Our innovation lies in the design of three\nkey modules: 1) an online fast gradient descent optimizer, further optimized by\na meta-learner, which ensures swift and robust adaptation to new tasks; 2) a\nSelf-Sampling module designed to provide well-aligned visual prompts for\nimproved attention allocation; and 3) a robust attention-based decoder\nspecifically designed for medical few-shot learning to capture the relationship\nbetween different slices. Extensive experiments on a popular abdominal CT\ndataset and an MRI dataset demonstrate that the proposed method achieves\nsignificant improvements over state-of-the-art methods in few-shot\nsegmentation, with average improvements of 10.21% and 1.80% in terms of DSC,\nrespectively. In conclusion, we present a novel approach for rapid online\nadaptation in interactive image segmentation, adapting to a new organ in just\n0.83 minutes. Code is publicly available on GitHub upon acceptance.\n","authors":["Yiming Zhang","Tianang Leng","Kun Han","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2308.16466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16463v1","updated":"2023-08-31T05:15:27Z","published":"2023-08-31T05:15:27Z","title":"Sparkles: Unlocking Chats Across Multiple Images for Multimodal\n Instruction-Following Models","summary":" Large language models exhibit enhanced zero-shot performance on various tasks\nwhen fine-tuned with instruction-following data. Multimodal\ninstruction-following models extend these capabilities by integrating both text\nand images. However, existing models such as MiniGPT-4 face challenges in\nmaintaining dialogue coherence in scenarios involving multiple images. A\nprimary reason is the lack of a specialized dataset for this critical\napplication. To bridge these gaps, we present SparklesChat, a multimodal\ninstruction-following model for open-ended dialogues across multiple images.
To\nsupport the training, we introduce SparklesDialogue, the first\nmachine-generated dialogue dataset tailored for word-level interleaved\nmulti-image and text interactions. Furthermore, we construct SparklesEval, a\nGPT-assisted benchmark for quantitatively assessing a model's conversational\ncompetence across multiple images and dialogue turns. Our experiments validate\nthe effectiveness of SparklesChat in understanding and reasoning across\nmultiple images and dialogue turns. Specifically, SparklesChat outperformed\nMiniGPT-4 on established vision-and-language benchmarks, including the BISON\nbinary image selection task and the NLVR2 visual reasoning task. Moreover,\nSparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding\nMiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative\nevaluations further demonstrate SparklesChat's generality in handling\nreal-world applications. All resources will be available at\nhttps://github.com/HYPJUDY/Sparkles.\n","authors":["Yupan Huang","Zaiqiao Meng","Fangyu Liu","Yixuan Su","Nigel Collier","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07864v2","updated":"2023-08-31T05:11:10Z","published":"2022-11-15T03:10:05Z","title":"Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning","summary":" Federated learning (FL) enables multiple clients to collaboratively train a\nglobal model without disclosing their data. Previous research often requires\ntraining the complete set of model parameters. However, the emergence of powerful\npre-trained models makes it possible to achieve higher performance with fewer\nlearnable parameters in FL. In this paper, we propose a federated adaptive\nprompt tuning algorithm, FedAPT, for multi-domain collaborative image\nclassification with powerful foundation models, like CLIP. Compared with direct\nfederated prompt tuning, our core idea is to adaptively unlock specific domain\nknowledge for each test sample in order to provide them with personalized\nprompts. To implement this idea, we design an adaptive prompt tuning module,\nwhich consists of a meta prompt, an adaptive network, and some keys. The server\nrandomly generates a set of keys and assigns a unique key to each client. Then\nall clients cooperatively train the global adaptive network and meta prompt\nwith the local datasets and the frozen keys. Ultimately, the global aggregation\nmodel can assign a personalized prompt to CLIP based on the domain features of\neach test sample. We perform extensive experiments on two multi-domain image\nclassification datasets across two different settings - supervised and\nunsupervised. The results show that FedAPT can achieve better performance with\nless than 10\% of the number of parameters of the fully trained model, and the\nglobal model can perform well in diverse client domains simultaneously.\n","authors":["Shangchao Su","Mingzhao Yang","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2211.07864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05602v2","updated":"2023-08-31T05:08:45Z","published":"2023-05-09T16:51:00Z","title":"Collaborative Chinese Text Recognition with Personalized Federated\n Learning","summary":" In Chinese text recognition, to compensate for the insufficient local data\nand improve the performance of local few-shot character recognition, it is\noften necessary for one organization to collect a large amount of data from\nsimilar organizations.
However, due to the natural presence of private\ninformation in text data, such as addresses and phone numbers, different\norganizations are unwilling to share private data. Therefore, it becomes\nincreasingly important to design a privacy-preserving collaborative training\nframework for the Chinese text recognition task. In this paper, we introduce\npersonalized federated learning (pFL) into the Chinese text recognition task\nand propose the pFedCR algorithm, which significantly improves the model\nperformance of each client (organization) without sharing private data.\nSpecifically, pFedCR comprises two stages: a multi-round global model\ntraining stage and a local personalization stage. During stage 1, an\nattention mechanism is incorporated into the CRNN model to adapt to various\nclient data distributions. Leveraging inherent character data characteristics,\na balanced dataset is created on the server to mitigate character imbalance. In\nthe personalization phase, the global model is fine-tuned for one epoch to\ncreate a local model. Parameter averaging between local and global models\ncombines personalized and global feature extraction capabilities. Finally, we\nfine-tune only the attention layers to enhance their focus on local personalized\nfeatures. The experimental results on three real-world industrial scenario\ndatasets show that the pFedCR algorithm can improve the performance of local\npersonalized models by about 20\% while also improving their generalization\nperformance on other client data domains. Compared to other state-of-the-art\npersonalized federated learning methods, pFedCR improves performance by 6\%\n$\sim$ 8\%.\n","authors":["Shangchao Su","Haiyang Yu","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2305.05602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16461v1","updated":"2023-08-31T05:05:53Z","published":"2023-08-31T05:05:53Z","title":"Domain Adaptive Synapse Detection with Weak Point Annotations","summary":" The development of learning-based methods has greatly improved the detection\nof synapses from electron microscopy (EM) images. However, training a model for\neach dataset is time-consuming and requires extensive annotations.\nAdditionally, it is difficult to apply a learned model to data from different\nbrain regions due to variations in data distributions. In this paper, we\npresent AdaSyn, a two-stage segmentation-based framework for domain adaptive\nsynapse detection with weak point annotations. In the first stage, we address\nthe detection problem by utilizing a segmentation-based pipeline to obtain\nsynaptic instance masks. In the second stage, we improve model generalizability\non target data by regenerating square masks to get high-quality pseudo labels.\nBenefiting from our high-accuracy detection results, we introduce the distance\nnearest principle to match paired pre-synapses and post-synapses. In the\nWASPSYN challenge at ISBI 2023, our method ranked 1st place.\n","authors":["Qi Chen","Wei Huang","Yueyi Zhang","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.16461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16460v1","updated":"2023-08-31T04:58:17Z","published":"2023-08-31T04:58:17Z","title":"Improving Lens Flare Removal with General Purpose Pipeline and Multiple\n Light Sources Recovery","summary":" When taking images against strong light sources, the resulting images often\ncontain heterogeneous flare artifacts.
These artifacts can significantly affect\nimage visual quality and downstream computer vision tasks. Since collecting\nreal data pairs of flare-corrupted/flare-free images for training flare removal\nmodels is challenging, current methods utilize the direct-add approach to\nsynthesize data. However, these methods do not consider automatic exposure and\ntone mapping in the image signal processing pipeline (ISP), leading to the limited\ngeneralization capability of deep models trained using such data. Besides,\nexisting methods struggle to handle multiple light sources due to the different\nsizes, shapes and illuminance of various light sources. In this paper, we\npropose a solution to improve the performance of lens flare removal by\nrevisiting the ISP, remodeling the principle of automatic exposure in the\nsynthesis pipeline, and designing a more reliable light source recovery strategy.\nThe new pipeline approaches realistic imaging by discriminating the local and\nglobal illumination through convex combination, avoiding global illumination\nshifting and local over-saturation. Our strategy for recovering multiple light\nsources convexly averages the input and output of the neural network based on\nilluminance levels, thereby avoiding the need for a hard threshold in\nidentifying light sources. We also contribute a new flare removal testing\ndataset containing the flare-corrupted images captured by ten types of consumer\nelectronics. The dataset facilitates the verification of the generalization\ncapability of flare removal methods. Extensive experiments show that our\nsolution can effectively improve the performance of lens flare removal and push\nthe frontier toward more general situations.\n","authors":["Yuyan Zhou","Dong Liang","Songcan Chen","Sheng-Jun Huang","Shuo Yang","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.16460v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16454v1","updated":"2023-08-31T04:46:12Z","published":"2023-08-31T04:46:12Z","title":"Adversarial Finetuning with Latent Representation Constraint to Mitigate\n Accuracy-Robustness Tradeoff","summary":" This paper addresses the tradeoff between standard accuracy on clean examples\nand robustness against adversarial examples in deep neural networks (DNNs).\nAlthough adversarial training (AT) improves robustness, it degrades the\nstandard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we\npropose a novel AT method called ARREST, which comprises three components: (i)\nadversarial finetuning (AFT), (ii) representation-guided knowledge distillation\n(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples\nby initializing its parameters with a DNN that is standardly pretrained on\nclean examples. RGKD and NR respectively entail a regularization term and an\nalgorithm to preserve latent representations of clean examples during AFT. RGKD\npenalizes the distance between the representations of the standardly pretrained\nand AFT DNNs. NR switches input adversarial examples to nonadversarial ones\nwhen the representation changes significantly during AFT.
By combining these\ncomponents, ARREST achieves both high standard accuracy and robustness.\nExperimental results demonstrate that ARREST mitigates the tradeoff more\neffectively than previous AT-based methods do.\n","authors":["Satoshi Suzuki","Shin'ya Yamaguchi","Shoichiro Takeda","Sekitoshi Kanai","Naoki Makishima","Atsushi Ando","Ryo Masumura"],"pdf_url":"https://arxiv.org/pdf/2308.16454v1.pdf","comment":"Accepted by International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2304.06028v2","updated":"2023-08-31T04:36:04Z","published":"2023-04-12T17:59:58Z","title":"RECLIP: Resource-efficient CLIP by Training with Small Images","summary":" We present RECLIP (Resource-efficient CLIP), a simple method that minimizes\ncomputational resource footprint for CLIP (Contrastive Language Image\nPretraining). Inspired by the notion of coarse-to-fine in computer vision, we\nleverage small images to learn from large-scale language supervision\nefficiently, and finetune the model with high-resolution data in the end. Since\nthe complexity of the vision transformer heavily depends on input image size,\nour approach significantly reduces the training resource requirements both in\ntheory and in practice. Using the same batch size and training epoch, RECLIP\nachieves highly competitive zero-shot classification and image-text retrieval\naccuracy with 6 to 8x less computational resources and 7 to 9x fewer FLOPs than\nthe baseline. Compared to the state-of-the-art contrastive learning methods,\nRECLIP demonstrates 5 to 59x training resource savings while maintaining highly\ncompetitive zero-shot classification and retrieval performance. Finally, RECLIP\nmatches the state of the art in transfer learning to open-vocabulary detection\ntasks, achieving 32 APr on LVIS. We hope this work will pave the path for the\nbroader research community to explore language supervised pretraining in\nresource-friendly settings.\n","authors":["Runze Li","Dahun Kim","Bir Bhanu","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2304.06028v2.pdf","comment":"Published at Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2207.13085v3","updated":"2023-08-31T04:00:18Z","published":"2022-07-26T17:57:58Z","title":"Group DETR: Fast DETR Training with Group-Wise One-to-Many Assignment","summary":" Detection transformer (DETR) relies on one-to-one assignment, assigning one\nground-truth object to one prediction, for end-to-end detection without NMS\npost-processing. It is known that one-to-many assignment, assigning one\nground-truth object to multiple predictions, succeeds in detection methods such\nas Faster R-CNN and FCOS. However, the naive one-to-many assignment does not work\nfor DETR, and it remains challenging to apply one-to-many assignment to DETR\ntraining. In this paper, we introduce Group DETR, a simple yet efficient DETR\ntraining approach that introduces a group-wise way for one-to-many assignment.\nThis approach involves using multiple groups of object queries, conducting\none-to-one assignment within each group, and performing decoder self-attention\nseparately. It resembles data augmentation with automatically-learned object\nquery augmentation. It is also equivalent to simultaneously training\nparameter-sharing networks of the same architecture, introducing more\nsupervision and thus improving DETR training. The inference process is the same\nas DETR trained normally and only needs one group of queries without any\narchitecture modification.
Group DETR is versatile and is applicable to various\nDETR variants. The experiments show that Group DETR significantly speeds up the\ntraining convergence and improves the performance of various DETR-based models.\nCode will be available at \\url{https://github.com/Atten4Vis/GroupDETR}.\n","authors":["Qiang Chen","Xiaokang Chen","Jian Wang","Shan Zhang","Kun Yao","Haocheng Feng","Junyu Han","Errui Ding","Gang Zeng","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2207.13085v3.pdf","comment":"ICCV23 camera ready version"},{"id":"http://arxiv.org/abs/2308.16435v1","updated":"2023-08-31T03:49:41Z","published":"2023-08-31T03:49:41Z","title":"Njobvu-AI: An open-source tool for collaborative image labeling and\n implementation of computer vision models","summary":" Practitioners interested in using computer vision models lack user-friendly\nand open-source software that combines features to label training data, allow\nmultiple users, train new algorithms, review output, and implement new models.\nLabeling training data, such as images, is a key step to developing accurate\nobject detection algorithms using computer vision. This step is often not\ncompatible with many cloud-based services for marking or labeling image and\nvideo data due to limited internet bandwidth in many regions of the world.\nDesktop tools are useful for groups working in remote locations, but users\noften do not have the capability to combine projects developed locally by\nmultiple collaborators. Furthermore, many tools offer features for labeling\ndata or using pre-trained models for classification, but few allow researchers\nto combine these steps to create and apply custom models. Free, open-source,\nand user-friendly software that offers a full suite of features (e.g., ability\nto work locally and online, and train custom models) is desirable to field\nresearchers and conservationists that may have limited coding skills. We\ndeveloped Njobvu-AI, a free, open-source tool that can be run on both desktop\nand server hardware using Node.js, allowing users to label data, combine\nprojects for collaboration and review, train custom algorithms, and implement\nnew computer vision models. The name Njobvu-AI (pronounced N-joh-voo AI),\nincorporating the Chichewa word for elephant, is inspired by a wildlife\nmonitoring program in Malawi that was a primary impetus for the development of\nthis tool and references similarities between the powerful memory of elephants\nand properties of computer vision models.\n","authors":["Jonathan S. Koning","Ashwin Subramanian","Mazen Alotaibi","Cara L. Appel","Christopher M. Sullivan","Thon Chao","Lisa Truong","Robyn L. Tanguay","Pankaj Jaiswal","Taal Levi","Damon B. Lesmeister"],"pdf_url":"https://arxiv.org/pdf/2308.16435v1.pdf","comment":"13 pages, 6 figures. For code and documentation, see\n https://github.com/sullichrosu/Njobvu-AI/"},{"id":"http://arxiv.org/abs/2307.07873v5","updated":"2023-08-31T03:47:35Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding and Improving Adversarial\n Transferability from Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. 
In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v5.pdf","comment":"IEEE Symposium on Security and Privacy (Oakland) 2024; Extended\n version of camera-ready"},{"id":"http://arxiv.org/abs/2305.07283v3","updated":"2023-08-31T03:47:34Z","published":"2023-05-12T06:56:22Z","title":"Quaternion-valued Correlation Learning for Few-Shot Semantic\n Segmentation","summary":" Few-shot segmentation (FSS) aims to segment unseen classes given only a few\nannotated samples. Encouraging progress has been made for FSS by leveraging\nsemantic features learned from base classes with sufficient training samples to\nrepresent novel classes. The correlation-based methods lack the ability to\nconsider interaction of the two subspace matching scores due to the inherent\nnature of the real-valued 2D convolutions. In this paper, we introduce a\nquaternion perspective on correlation learning and propose a novel\nQuaternion-valued Correlation Learning Network (QCLNet), with the aim to\nalleviate the computational burden of high-dimensional correlation tensor and\nexplore internal latent interaction between query and support images by\nleveraging operations defined by the established quaternion algebra.\nSpecifically, our QCLNet is formulated as a hyper-complex valued network and\nrepresents correlation tensors in the quaternion domain, which uses\nquaternion-valued convolution to explore the external relations of query\nsubspace when considering the hidden relationship of the support sub-dimension\nin the quaternion space. Extensive experiments on the PASCAL-5i and COCO-20i\ndatasets demonstrate that our method outperforms the existing state-of-the-art\nmethods effectively. 
Our code is available at\nhttps://github.com/zwzheng98/QCLNet and our article \"Quaternion-valued\nCorrelation Learning for Few-Shot Semantic Segmentation\" was published in IEEE\nTransactions on Circuits and Systems for Video Technology, vol. 33, no. 5,\npp. 2102-2115, May 2023, doi: 10.1109/TCSVT.2022.3223150.\n","authors":["Zewen Zheng","Guoheng Huang","Xiaochen Yuan","Chi-Man Pun","Hongrui Liu","Wing-Kuen Ling"],"pdf_url":"https://arxiv.org/pdf/2305.07283v3.pdf","comment":"for associated paper file, see\n https://ieeexplore.ieee.org/document/9954424?source=authoralert"},{"id":"http://arxiv.org/abs/2308.14604v2","updated":"2023-08-31T03:07:03Z","published":"2023-08-28T14:17:16Z","title":"SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space\n Reconstruction","summary":" Segment Anything Model (SAM) has received remarkable attention as it offers a\npowerful and versatile solution for object segmentation in images. However,\nfine-tuning SAM for downstream segmentation tasks under different scenarios\nremains a challenge, as the varied characteristics of different scenarios\nnaturally require diverse model parameter spaces. Most existing fine-tuning\nmethods attempt to bridge the gaps among different scenarios by introducing a\nset of new parameters to modify SAM's original parameter space. Unlike these\nworks, in this paper, we propose fine-tuning SAM efficiently by parameter space\nreconstruction (SAM-PARSER), which introduces nearly zero trainable parameters\nduring fine-tuning. In SAM-PARSER, we assume that SAM's original parameter\nspace is relatively complete, so that its bases are able to reconstruct the\nparameter space of a new scenario. We obtain the bases by matrix decomposition,\nand fine-tune the coefficients to reconstruct the parameter space tailored to\nthe new scenario by an optimal linear combination of the bases. Experimental\nresults show that SAM-PARSER exhibits superior segmentation performance across\nvarious scenarios, while reducing the number of trainable parameters by\n$\approx 290$ times compared with current parameter-efficient fine-tuning\nmethods.\n","authors":["Zelin Peng","Zhengqin Xu","Zhilin Zeng","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.00780v5","updated":"2023-08-31T02:27:48Z","published":"2022-07-26T10:59:42Z","title":"Visual correspondence-based explanations improve AI robustness and\n human-AI team accuracy","summary":" Explaining artificial intelligence (AI) predictions is increasingly important\nand even imperative in many high-stakes applications where humans are the\nultimate decision-makers. In this work, we propose two novel architectures of\nself-interpretable image classifiers that first explain, and then predict (as\nopposed to post-hoc explanations) by harnessing the visual correspondences\nbetween a query image and exemplars. Our models consistently improve (by 1 to 4\npoints) on out-of-distribution (OOD) datasets while performing marginally worse\n(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest\nneighbor classifier (kNN). Via a large-scale human study on ImageNet and CUB,\nour correspondence-based explanations are found to be more useful to users than\nkNN explanations. Our explanations help users more accurately reject AI's wrong\ndecisions than all other tested methods.
Interestingly, for the first time, we\nshow that it is possible to achieve complementary human-AI team accuracy (i.e.,\nthat is higher than either AI-alone or human-alone), in ImageNet and CUB image\nclassification tasks.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2208.00780v5.pdf","comment":"NeurIPS 2022 conference paper"},{"id":"http://arxiv.org/abs/2308.15690v2","updated":"2023-08-31T02:21:20Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprouts image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology. For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The label has 4 different\nclasses: background, head, body, tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, 5 physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. Also, we hope that this dataset can assist researchers\nstudying classification, semantic segmentation, decomposition, and physical\nfeature measurement in other industrial fields, in evaluating their models. The\ndataset is available at the authors' repository. (https://bhban.kr/data)\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v2.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"},{"id":"http://arxiv.org/abs/2308.16404v1","updated":"2023-08-31T02:13:15Z","published":"2023-08-31T02:13:15Z","title":"Deformation Robust Text Spotting with Geometric Prior","summary":" The goal of text spotting is to perform text detection and recognition in an\nend-to-end manner. Although the diversity of luminosity and orientation in\nscene texts has been widely studied, the font diversity and shape variance of\nthe same character are ignored in recent works, since most characters in\nnatural images are rendered in standard fonts. To solve this problem, we\npresent a Chinese Artistic Dataset, termed as ARText, which contains 33,000\nartistic images with rich shape deformation and font diversity. Based on this\ndatabase, we develop a deformation robust text spotting method (DR TextSpotter)\nto solve the recognition problem of complex deformation of characters in\ndifferent fonts. Specifically, we propose a geometric prior module to highlight\nthe important features based on the unsupervised landmark detection\nsub-network. A graph convolution network is further constructed to fuse the\ncharacter features and landmark features, and then performs semantic reasoning\nto enhance the discrimination for different characters. The experiments are\nconducted on ARText and IC19-ReCTS datasets. 
Our results demonstrate the\neffectiveness of our proposed method.\n","authors":["Xixuan Hao","Aozhong Zhang","Xianze Meng","Bin Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16145v2","updated":"2023-08-31T01:29:35Z","published":"2023-08-30T17:01:01Z","title":"CircleFormer: Circular Nuclei Detection in Whole Slide Images with\n Circle Queries and Attention","summary":" Both CNN-based and Transformer-based object detection with bounding box\nrepresentation have been extensively studied in computer vision and medical\nimage analysis, but circular object detection in medical images is still\nunderexplored. Inspired by the recent anchor free CNN-based circular object\ndetection method (CircleNet) for ball-shape glomeruli detection in renal\npathology, in this paper, we present CircleFormer, a Transformer-based circular\nmedical object detection with dynamic anchor circles. Specifically, queries\nwith circle representation in Transformer decoder iteratively refine the\ncircular object detection results, and a circle cross attention module is\nintroduced to compute the similarity between circular queries and image\nfeatures. A generalized circle IoU (gCIoU) is proposed to serve as a new\nregression loss of circular object detection as well. Moreover, our approach is\neasy to generalize to the segmentation task by adding a simple segmentation\nbranch to CircleFormer. We evaluate our method in circular nuclei detection and\nsegmentation on the public MoNuSeg dataset, and the experimental results show\nthat our method achieves promising performance compared with the\nstate-of-the-art approaches. The effectiveness of each component is validated\nvia ablation studies as well. Our code is released at\nhttps://github.com/zhanghx-iim-ahu/CircleFormer.\n","authors":["Hengxu Zhang","Pengpeng Liang","Zhiyong Sun","Bo Song","Erkang Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.16145v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.16386v1","updated":"2023-08-31T01:13:01Z","published":"2023-08-31T01:13:01Z","title":"RGB-T Tracking via Multi-Modal Mutual Prompt Learning","summary":" Object tracking based on the fusion of visible and thermal images, known as\nRGB-T tracking, has gained increasing attention from researchers in recent\nyears. How to achieve a more comprehensive fusion of information from the two\nmodalities with fewer computational costs has been a problem that researchers\nhave been exploring. Recently, with the rise of prompt learning in computer\nvision, we can better transfer knowledge from visual large models to downstream\ntasks. Considering the strong complementarity between visible and thermal\nmodalities, we propose a tracking architecture based on mutual prompt learning\nbetween the two modalities. We also design a lightweight prompter that\nincorporates attention mechanisms in two dimensions to transfer information\nfrom one modality to the other with lower computational costs, embedding it\ninto each layer of the backbone.
Extensive experiments have demonstrated that\nour proposed tracking architecture is effective and efficient, achieving\nstate-of-the-art performance while maintaining high running speeds.\n","authors":["Yang Luo","Xiqing Guo","Hui Feng","Lei Ao"],"pdf_url":"https://arxiv.org/pdf/2308.16386v1.pdf","comment":"9 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.16383v1","updated":"2023-08-31T01:00:59Z","published":"2023-08-31T01:00:59Z","title":"Separate and Locate: Rethink the Text in Text-based Visual Question\n Answering","summary":" Text-based Visual Question Answering (TextVQA) aims at answering questions\nabout the text in images. Most works in this field focus on designing network\nstructures or pre-training tasks. All these methods list the OCR texts in\nreading order (from left to right and top to bottom) to form a sequence, which\nis treated as a natural language ``sentence''. However, they ignore the fact\nthat most OCR words in the TextVQA task do not have a semantic contextual\nrelationship. In addition, these approaches use 1-D position embedding to\nconstruct the spatial relation between OCR tokens sequentially, which is not\nreasonable. The 1-D position embedding can only represent the left-right\nsequence relationship between words in a sentence, but not the complex spatial\nposition relationship. To tackle these problems, we propose a novel method\nnamed Separate and Locate (SaL) that explores text contextual cues and designs\nspatial position embedding to construct spatial relations between OCR texts.\nSpecifically, we propose a Text Semantic Separate (TSS) module that helps the\nmodel recognize whether words have semantic contextual relations. Then, we\nintroduce a Spatial Circle Position (SCP) module that helps the model better\nconstruct and reason the spatial position relationships between OCR texts. Our\nSaL model outperforms the baseline model by 4.44% and 3.96% accuracy on TextVQA\nand ST-VQA datasets. Compared with the pre-training state-of-the-art method\npre-trained on 64 million pre-training samples, our method, without any\npre-training tasks, still achieves 2.68% and 2.52% accuracy improvement on\nTextVQA and ST-VQA. Our code and models will be released at\nhttps://github.com/fangbufang/SaL.\n","authors":["Chengyang Fang","Jiangnan Li","Liang Li","Can Ma","Dayong Hu"],"pdf_url":"https://arxiv.org/pdf/2308.16383v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/1912.10122v3","updated":"2023-08-31T00:53:12Z","published":"2019-12-20T22:17:50Z","title":"A Region-based Randers Geodesic Approach for Image Segmentation","summary":" The geodesic model based on the eikonal partial differential equation (PDE)\nhas served as a fundamental tool for the applications of image segmentation and\nboundary detection in the past two decades. However, the existing approaches\ncommonly only exploit the image edge-based features for computing minimal\ngeodesic paths, potentially limiting their performance in complicated\nsegmentation situations. In this paper, we introduce a new variational image\nsegmentation model based on the minimal geodesic path framework and the eikonal\nPDE, where the region-based appearance term that defines the regional\nhomogeneity features can be taken into account for estimating the associated\nminimal geodesic paths. This is done by constructing a Randers geodesic metric\ninterpretation of the region-based active contour energy functional.
As a\nresult, the minimization of the active contour energy functional is transformed\ninto finding the solution to the Randers eikonal PDE.\n We also suggest a practical interactive image segmentation strategy, where\nthe target boundary can be delineated by the concatenation of several piecewise\ngeodesic paths. We invoke the Finsler variant of the fast marching method to\nestimate the geodesic distance map, yielding an efficient implementation of the\nproposed region-based Randers geodesic model for image segmentation.\nExperimental results on both synthetic and real images exhibit that our model\nindeed achieves encouraging segmentation performance.\n","authors":["Da Chen","Jean-Marie Mirebeau","Huazhong Shu","Laurent D. Cohen"],"pdf_url":"https://arxiv.org/pdf/1912.10122v3.pdf","comment":"To Appear in International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2308.16154v2","updated":"2023-08-31T00:51:45Z","published":"2023-08-30T17:20:46Z","title":"MMVP: Motion-Matrix-based Video Prediction","summary":" A central challenge of video prediction is that the system has to reason about\nthe objects' future motions from image frames while simultaneously maintaining\nthe consistency of their appearances across frames. This work introduces an\nend-to-end trainable two-stream video prediction framework, Motion-Matrix-based\nVideo Prediction (MMVP), to tackle this challenge. Unlike previous methods that\nusually handle motion prediction and appearance maintenance within the same set\nof modules, MMVP decouples motion and appearance information by constructing\nappearance-agnostic motion matrices. The motion matrices represent the temporal\nsimilarity of each and every pair of feature patches in the input frames, and\nare the sole input of the motion prediction module in MMVP. This design\nimproves video prediction in both accuracy and efficiency, and reduces the\nmodel size. Results of extensive experiments demonstrate that MMVP outperforms\nstate-of-the-art systems on public data sets by non-negligible margins\n(about 1 dB in PSNR on UCF Sports) with significantly smaller model sizes (84%\nof the size or smaller).\n","authors":["Yiqi Zhong","Luming Liang","Ilya Zharkov","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16154v2.pdf","comment":"ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2308.16380v1","updated":"2023-08-31T00:48:05Z","published":"2023-08-31T00:48:05Z","title":"3D vision-based structural masonry damage detection","summary":" The detection of masonry damage is essential for preventing potentially\ndisastrous outcomes. Manual inspection can, however, take a long time and be\nhazardous to human inspectors. Automation of the inspection process using novel\ncomputer vision and machine learning algorithms can be a more efficient and\nsafe solution to prevent further deterioration of the masonry structures. Most\nexisting 2D vision-based methods are limited to qualitative damage\nclassification, 2D localization, and in-plane quantification. In this study, we\npresent a 3D vision-based methodology for accurate masonry damage detection,\nwhich offers a more robust solution with a greater field of view, depth of\nvision, and the ability to detect failures in complex environments. First,\nimages of the masonry specimens are collected to generate a 3D point cloud.\nSecond, 3D point cloud processing methods are developed to evaluate the\nmasonry damage. We demonstrate the effectiveness of our approach through\nexperiments on structural masonry components.
Our experiments showed the\nproposed system can effectively classify damage states and localize and\nquantify critical damage features. The result showed the proposed method can\nimprove the level of autonomy during the inspection of masonry structures.\n","authors":["Elmira Faraji Zonouz","Xiao Pan","Yu-Cheng Hsu","Tony Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16380v1.pdf","comment":"10 pages, accepted in the Canadian Conference - Pacific Conference on\n Earthquake Engineering 2023, Vancouver, British Columbia"},{"id":"http://arxiv.org/abs/2308.15791v2","updated":"2023-08-31T00:46:47Z","published":"2023-08-30T06:49:34Z","title":"Neural Video Compression with Temporal Layer-Adaptive Hierarchical\n B-frame Coding","summary":" Neural video compression (NVC) is a rapidly evolving video coding research\narea, with some models achieving superior coding efficiency compared to the\nlatest video coding standard Versatile Video Coding (VVC). In conventional\nvideo coding standards, the hierarchical B-frame coding, which utilizes a\nbidirectional prediction structure for higher compression, had been\nwell-studied and exploited. In NVC, however, limited research has investigated\nthe hierarchical B scheme. In this paper, we propose an NVC model exploiting\nhierarchical B-frame coding with temporal layer-adaptive optimization. We first\nextend an existing unidirectional NVC model to a bidirectional model, which\nachieves -21.13% BD-rate gain over the unidirectional baseline model. However,\nthis model faces challenges when applied to sequences with complex or large\nmotions, leading to performance degradation. To address this, we introduce\ntemporal layer-adaptive optimization, incorporating methods such as temporal\nlayer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent\nscaling (TALS). The final model with the proposed methods achieves an\nimpressive BD-rate gain of -39.86% against the baseline. It also resolves the\nchallenges in sequences with large or complex motions with up to -49.13% more\nBD-rate gains than the simple bidirectional extension. This improvement is\nattributed to the allocation of more bits to lower temporal layers, thereby\nenhancing overall reconstruction quality with smaller bits. Since our method\nhas little dependency on a specific NVC model architecture, it can serve as a\ngeneral tool for extending unidirectional NVC models to the ones with\nhierarchical B-frame coding.\n","authors":["Yeongwoong Kim","Suyong Bahk","Seungeon Kim","Won Hee Lee","Dokwan Oh","Hui Yong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16376v1","updated":"2023-08-31T00:36:10Z","published":"2023-08-31T00:36:10Z","title":"Improving Multiple Sclerosis Lesion Segmentation Across Clinical Sites:\n A Federated Learning Approach with Noise-Resilient Training","summary":" Accurately measuring the evolution of Multiple Sclerosis (MS) with magnetic\nresonance imaging (MRI) critically informs understanding of disease progression\nand helps to direct therapeutic strategy. Deep learning models have shown\npromise for automatically segmenting MS lesions, but the scarcity of accurately\nannotated data hinders progress in this area. Obtaining sufficient data from a\nsingle clinical site is challenging and does not address the heterogeneous need\nfor model robustness. Conversely, the collection of data from multiple sites\nintroduces data privacy concerns and potential label noise due to varying\nannotation standards. 
To address this dilemma, we explore the use of the\nfederated learning framework while considering label noise. Our approach\nenables collaboration among multiple clinical sites without compromising data\nprivacy under a federated learning paradigm that incorporates a noise-robust\ntraining strategy based on label correction. Specifically, we introduce a\nDecoupled Hard Label Correction (DHLC) strategy that considers the imbalanced\ndistribution and fuzzy boundaries of MS lesions, enabling the correction of\nfalse annotations based on prediction confidence. We also introduce a Centrally\nEnhanced Label Correction (CELC) strategy, which leverages the aggregated\ncentral model as a correction teacher for all sites, enhancing the reliability\nof the correction process. Extensive experiments conducted on two multi-site\ndatasets demonstrate the effectiveness and robustness of our proposed methods,\nindicating their potential for clinical applications in multi-site\ncollaborations.\n","authors":["Lei Bai","Dongang Wang","Michael Barnett","Mariano Cabezas","Weidong Cai","Fernando Calamante","Kain Kyle","Dongnan Liu","Linda Ly","Aria Nguyen","Chun-Chien Shieh","Ryan Sullivan","Hengrui Wang","Geng Zhan","Wanli Ouyang","Chenyu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16376v1.pdf","comment":"11 pages, 4 figures, journal submission"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.16761v1","updated":"2023-08-31T14:29:10Z","published":"2023-08-31T14:29:10Z","title":"Co-evolving Vector Quantization for ID-based Recommendation","summary":" Category information plays a crucial role in enhancing the quality and\npersonalization of recommendations. Nevertheless, the availability of item\ncategory information is not consistently present, particularly in the context\nof ID-based recommendations. In this work, we propose an alternative approach\nto automatically learn and generate entity (i.e., user and item) categorical\ninformation at different levels of granularity, specifically for ID-based\nrecommendation. Specifically, we devise a co-evolving vector quantization\nframework, namely COVE, which enables the simultaneous learning and refinement\nof code representation and entity embedding in an end-to-end manner, starting\nfrom the randomly initialized states. With its high adaptability, COVE can be\neasily integrated into existing recommendation models. We validate the\neffectiveness of COVE on various recommendation tasks including list\ncompletion, collaborative filtering, and click-through rate prediction, across\ndifferent recommendation models. We will publish the code and data for other\nresearchers to reproduce our work.\n","authors":["Qijiong Liu","Jiaren Xiao","Lu Fan","Jieming Zhu","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.16761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16753v1","updated":"2023-08-31T14:19:50Z","published":"2023-08-31T14:19:50Z","title":"Context Aware Query Rewriting for Text Rankers using LLM","summary":" Query rewriting refers to an established family of approaches that are\napplied to underspecified and ambiguous queries to overcome the vocabulary\nmismatch problem in document ranking. Queries are typically rewritten during\nquery processing time for better query modelling for the downstream ranker.\nWith the advent of large-language models (LLMs), there have been initial\ninvestigations into using generative approaches to generate pseudo documents to\ntackle this inherent vocabulary gap. 
In this work, we analyze the utility of\nLLMs for improved query rewriting for text ranking tasks. We find that there\nare two inherent limitations of using LLMs as query re-writers -- concept drift\nwhen using only queries as prompts and large inference costs during query\nprocessing. We adopt a simple, yet surprisingly effective, approach called\ncontext aware query rewriting (CAR) to leverage the benefits of LLMs for query\nunderstanding. Firstly, we rewrite ambiguous training queries by context-aware\nprompting of LLMs, where we use only relevant documents as context. Unlike\nexisting approaches, we use LLM-based query rewriting only during the training\nphase. Eventually, a ranker is fine-tuned on the rewritten queries instead of\nthe original queries during training. In our extensive experiments, we find\nthat fine-tuning a ranker using re-written queries offers a significant\nimprovement of up to 33% on the passage ranking task and up to 28% on the\ndocument ranking task when compared to the baseline performance of using\noriginal queries.\n","authors":["Abhijit Anand","Venktesh V","Vinay Setty","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2308.16753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14276v2","updated":"2023-08-31T14:05:51Z","published":"2023-08-28T03:15:37Z","title":"Alleviating Video-Length Effect for Micro-video Recommendation","summary":" Micro-video platforms such as TikTok are extremely popular nowadays. One\nimportant feature is that users no longer select videos of interest from a set;\ninstead, they either watch the recommended video or skip to the next one. As a\nresult, the time length of users' watching behavior becomes the most important\nsignal for identifying preferences. However, our empirical data analysis has\nshown a video-length effect whereby long videos tend to receive a higher\nvalue of average view time; thus, adopting such view-time labels for measuring\nuser preferences can easily induce a biased model that favors longer\nvideos. In this paper, we propose a Video Length Debiasing Recommendation\n(VLDRec) method to alleviate such an effect for micro-video recommendation.\nVLDRec designs the data labeling approach and the sample generation module that\nbetter capture user preferences in a view-time oriented manner. It further\nleverages the multi-task learning technique to jointly optimize the above\nsamples with original biased ones. Extensive experiments show that VLDRec can\nimprove the users' view time by 1.81% and 11.32% on two real-world datasets,\ngiven a recommendation list of a fixed overall video length, compared with the\nbest baseline method. Moreover, VLDRec is also more effective in matching\nusers' interests in terms of the video content.\n","authors":["Yuhan Quan","Jingtao Ding","Chen Gao","Nian Li","Lingling Yi","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.14276v2.pdf","comment":"Accepted by TOIS"},{"id":"http://arxiv.org/abs/2305.06566v4","updated":"2023-08-31T13:43:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items.
Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16708v1","updated":"2023-08-31T13:24:57Z","published":"2023-08-31T13:24:57Z","title":"Concentrating on the Impact: Consequence-based Explanations in\n Recommender Systems","summary":" Recommender systems assist users in decision-making, where the presentation\nof recommended items and their explanations are critical factors for enhancing\nthe overall user experience. Although various methods for generating\nexplanations have been proposed, there is still room for improvement,\nparticularly for users who lack expertise in a specific item domain. In this\nstudy, we introduce the novel concept of \textit{consequence-based\nexplanations}, a type of explanation that emphasizes the individual impact of\nconsuming a recommended item on the user, which makes the effect of following\nrecommendations clearer. We conducted an online user study to examine our\nassumption about the appreciation of consequence-based explanations and their\nimpacts on different explanation aims in recommender systems. Our findings\nhighlight the importance of consequence-based explanations, which were\nwell-received by users and effectively improved user satisfaction in\nrecommender systems. These results provide valuable insights for designing\nengaging explanations that can enhance the overall user experience in\ndecision-making.\n","authors":["Sebastian Lubos","Thi Ngoc Trang Tran","Seda Polat Erdeniz","Merfat El Mansi","Alexander Felfernig","Manfred Wundara","Gerhard Leitner"],"pdf_url":"https://arxiv.org/pdf/2308.16708v1.pdf","comment":"Preprint of the paper to be presented at IntRS'23: Joint Workshop on\n Interfaces and Human Decision Making for Recommender Systems, September 18,\n 2023, Singapore. Paper will be published in the workshop proceedings"},{"id":"http://arxiv.org/abs/2307.15464v5","updated":"2023-08-31T12:50:59Z","published":"2023-07-28T10:34:47Z","title":"Framework to Automatically Determine the Quality of Open Data Catalogs","summary":" Data catalogs play a crucial role in modern data-driven organizations by\nfacilitating the discovery, understanding, and utilization of diverse data\nassets. However, ensuring their quality and reliability is complex, especially\nin open and large-scale data environments.
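The "open-source LLM as content encoder" half of ONCE can be illustrated with a minimal sketch, assuming the Hugging Face `transformers` library. The model name (a small GPT-2 stand-in rather than the larger LLMs the paper targets) and the mean-pooling choice are assumptions for illustration only.

```python
# Minimal sketch: use an open-source LM's deep hidden states as item content
# embeddings. Model choice and pooling are illustrative, not the paper's
# exact configuration; requires the `transformers` package.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # stand-in open-source LM
encoder = AutoModel.from_pretrained("gpt2")

@torch.no_grad()
def encode_item(text: str) -> torch.Tensor:
    batch = tokenizer(text, return_tensors="pt", truncation=True)
    hidden = encoder(**batch).last_hidden_state      # (1, seq_len, dim)
    return hidden.mean(dim=1).squeeze(0)             # mean-pooled embedding

emb = encode_item("Breaking: new battery chemistry doubles energy density")
print(emb.shape)  # e.g. torch.Size([768]); feed into the recommender
```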
This paper proposes a framework to\nautomatically determine the quality of open data catalogs, addressing the need\nfor efficient and reliable quality assessment mechanisms. Our framework can\nanalyze various core quality dimensions, such as accuracy, completeness,\nconsistency, scalability, and timeliness; offer several alternatives for\nassessing compatibility and similarity across such catalogs; and implement a\nset of non-core quality dimensions such as provenance, readability, and\nlicensing. The goal is to empower data-driven organizations to\nmake informed decisions based on trustworthy and well-curated data assets. The\nsource code that illustrates our approach can be downloaded from\nhttps://www.github.com/jorge-martinez-gil/dataq/.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.15464v5.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2302.07669v2","updated":"2023-08-31T11:24:15Z","published":"2023-02-15T14:06:39Z","title":"Unsupervised Hashing with Similarity Distribution Calibration","summary":" Unsupervised hashing methods typically aim to preserve the similarity between\ndata points in a feature space by mapping them to binary hash codes. However,\nthese methods often overlook the fact that the similarity between data points\nin the continuous feature space may not be preserved in the discrete hash code\nspace, due to the limited similarity range of hash codes. The similarity range\nis bounded by the code length and can lead to a problem known as similarity\ncollapse. That is, the positive and negative pairs of data points become less\ndistinguishable from each other in the hash space. To alleviate this problem,\nin this paper a novel Similarity Distribution Calibration (SDC) method is\nintroduced. SDC aligns the hash code similarity distribution towards a\ncalibration distribution (e.g., beta distribution) with sufficient spread\nacross the entire similarity range, thus alleviating the similarity collapse\nproblem. Extensive experiments show that our SDC significantly outperforms the\nstate-of-the-art alternatives on coarse category-level and instance-level image\nretrieval. Code is available at https://github.com/kamwoh/sdc.\n","authors":["Kam Woh Ng","Xiatian Zhu","Jiun Tian Hoe","Chee Seng Chan","Tianyu Zhang","Yi-Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2302.07669v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2308.16609v1","updated":"2023-08-31T10:12:32Z","published":"2023-08-31T10:12:32Z","title":"Towards Long-Tailed Recognition for Graph Classification via\n Collaborative Experts","summary":" Graph classification, aiming at learning graph-level representations for\neffective class assignments, has achieved remarkable success, but it relies\nheavily on high-quality datasets with balanced class distributions.\nIn fact, most real-world graph data naturally presents a long-tailed form,\nwhere the head classes occupy many more samples than the tail classes; it is\nthus essential to study graph-level classification over long-tailed data, yet\nthis remains largely unexplored. Moreover, most existing long-tailed\nlearning methods in vision fail to jointly optimize representation\nlearning and classifier training, and neglect the mining of\nhard-to-classify classes.
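One plausible way to realize the calibration that SDC describes is quantile matching between the empirical similarity distribution and a spread-out beta distribution. The sketch below is an assumption-laden numpy illustration, not the paper's actual loss.

```python
# Sketch of similarity distribution calibration: push a collapsed hash-code
# similarity distribution toward a beta calibration distribution by matching
# order statistics. Illustrative only; the paper's loss may differ.
import numpy as np

rng = np.random.default_rng(0)
sims = rng.uniform(0.4, 0.6, size=1024)   # collapsed similarities in [0, 1]

def calibration_targets(n, a=0.5, b=0.5):
    # Sorted samples from a beta distribution spanning the full range.
    return np.sort(rng.beta(a, b, size=n))

def sdc_loss(sims, a=0.5, b=0.5):
    # Match order statistics of the similarities to the beta targets.
    return np.mean((np.sort(sims) - calibration_targets(len(sims), a, b)) ** 2)

print(f"calibration loss: {sdc_loss(sims):.4f}")
```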
Directly applying existing methods to graphs may lead\nto sub-optimal performance, since the model trained on graphs would be more\nsensitive to the long-tailed distribution due to the complex topological\ncharacteristics. Hence, in this paper, we propose a novel long-tailed\ngraph-level classification framework via Collaborative Multi-expert Learning\n(CoMe) to tackle the problem. To equilibrate the contributions of head and tail\nclasses, we first develop balanced contrastive learning from the view of\nrepresentation learning, and then design an individual-expert classifier\ntraining based on hard class mining. In addition, we execute gated fusion and\ndisentangled knowledge distillation among the multiple experts to promote\ncollaboration within the multi-expert framework. Comprehensive experiments are\nperformed on seven widely-used benchmark datasets to demonstrate the\nsuperiority of our method CoMe over state-of-the-art baselines.\n","authors":["Siyu Yi","Zhengyang Mao","Wei Ju","Yongdao Zhou","Luchen Liu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16609v1.pdf","comment":"Accepted by IEEE Transactions on Big Data (TBD 2024)"},{"id":"http://arxiv.org/abs/2308.16505v1","updated":"2023-08-31T07:36:44Z","published":"2023-08-31T07:36:44Z","title":"Recommender AI Agent: Integrating Large Language Models for Interactive\n Recommendations","summary":" Recommender models excel at providing domain-specific item recommendations by\nleveraging extensive user behavior data. Despite their ability to act as\nlightweight domain experts, they struggle to perform versatile tasks such as\nproviding explanations and engaging in conversations. On the other hand, large\nlanguage models (LLMs) represent a significant step towards artificial general\nintelligence, showcasing remarkable capabilities in instruction comprehension,\ncommonsense reasoning, and human interaction. However, LLMs lack the knowledge\nof domain-specific item catalogs and behavioral patterns, particularly in areas\nthat diverge from general world knowledge, such as online e-commerce.\nFinetuning LLMs for each domain is neither economical nor efficient.\n In this paper, we bridge the gap between recommender models and LLMs,\ncombining their respective strengths to create a versatile and interactive\nrecommender system. We introduce an efficient framework called RecAgent, which\nemploys LLMs as the brain and recommender models as tools. We first outline a\nminimal set of essential tools required to transform LLMs into RecAgent. We\nthen propose an efficient workflow within RecAgent for task execution,\nincorporating key components such as a memory bus, dynamic\ndemonstration-augmented task planning, and reflection. RecAgent enables\ntraditional recommender systems, such as ID-based matrix factorization\nmodels, to become interactive systems with a natural language interface through\nthe integration of LLMs.
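The "LLM as brain, recommenders as tools" control flow can be pictured with a toy loop. Everything below (function names, the plan format, the memory-bus representation) is a hypothetical sketch; the abstract does not specify RecAgent's actual interfaces.

```python
# Toy sketch of an LLM-planned tool loop with a memory bus. All names are
# hypothetical stand-ins, not RecAgent's real API.
def rec_agent(user_utterance, llm_plan, tools, memory):
    memory.append({"role": "user", "content": user_utterance})
    plan = llm_plan(memory)                # e.g. [("retrieve", {...}), ("rank", {...})]
    for tool_name, args in plan:
        result = tools[tool_name](**args)  # a recommender model used as a tool
        memory.append({"role": "tool", "name": tool_name, "content": result})
    return memory[-1]["content"]           # final recommendation or reply

# Usage sketch: tools = {"retrieve": mf_retrieve, "rank": ctr_rank}, where the
# matrix-factorization retriever and CTR ranker are ordinary domain models.
```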
Experimental results on several public datasets show\nthat RecAgent achieves satisfactory performance as a conversational recommender\nsystem, outperforming general-purpose LLMs.\n","authors":["Xu Huang","Jianxun Lian","Yuxuan Lei","Jing Yao","Defu Lian","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.16505v1.pdf","comment":"16 pages, 15 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.16437v1","updated":"2023-08-31T03:52:57Z","published":"2023-08-31T03:52:57Z","title":"AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR\n Prediction","summary":" Click-through rate (CTR) prediction is a crucial issue in recommendation\nsystems. There has been an emergence of various public CTR datasets. However,\nexisting datasets primarily suffer from the following limitations. Firstly,\nusers generally click different types of items from multiple scenarios, and\nmodeling from multiple scenarios can provide a more comprehensive understanding\nof users. Existing datasets only include data for the same type of items from a\nsingle scenario. Secondly, multi-modal features are essential in multi-scenario\nprediction as they address the issue of inconsistent ID encoding between\ndifferent scenarios. The existing datasets are based on ID features and lack\nmulti-modal features. Thirdly, a large-scale dataset can provide a more reliable\nevaluation of models, fully reflecting the performance differences between\nmodels. The scale of existing datasets is around 100 million, which is\nrelatively small compared to real-world CTR prediction. To address these\nlimitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset\nbased on industrial data from Alipay. Specifically, AntM$^{2}$C provides the\nfollowing advantages: 1) It covers CTR data of 5 different types of items,\nproviding insights into the preferences of users for different items, including\nadvertisements, vouchers, mini-programs, contents, and videos. 2) Apart from\nID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text\nand image features, which can effectively establish connections between items\nwith different IDs. 3) AntM$^{2}$C provides 1 billion CTR records with 200\nfeatures, including 200 million users and 6 million items. It is currently the\nlargest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several\ntypical CTR tasks and provide comparisons with baseline methods. The dataset\nhomepage is available at https://www.atecup.cn/home.\n","authors":["Zhaoxin Huan","Ke Ding","Ang Li","Xiaolu Zhang","Xu Min","Yong He","Liang Zhang","Jun Zhou","Linjian Mo","Jinjie Gu","Zhongyi Liu","Wenliang Zhong","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16437v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.04726v2","updated":"2023-08-31T17:59:49Z","published":"2023-07-10T17:34:23Z","title":"Diffusion Policies for Out-of-Distribution Generalization in Offline\n Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) methods leverage previous experiences to\nlearn better policies than the behavior policy used for data collection. In\ncontrast to behavior cloning, which assumes the data is collected from expert\ndemonstrations, offline RL can work with non-expert data and multimodal\nbehavior policies. However, offline RL algorithms face challenges in handling\ndistribution shifts and effectively representing policies due to the lack of\nonline interaction during training.
Prior work on offline RL uses conditional\ndiffusion models to represent multimodal behavior in the dataset. Nevertheless,\nthese methods are not tailored to alleviating out-of-distribution state\ngeneralization. We introduce a novel method, named State Reconstruction for\nDiffusion Policies (SRDP), which incorporates state reconstruction feature learning\ninto the recent class of diffusion policies to address the out-of-distribution\ngeneralization problem. State reconstruction loss promotes more descriptive\nrepresentation learning of states to alleviate the distribution shift incurred\nby the out-of-distribution (OOD) states. We design a novel 2D Multimodal\nContextual Bandit environment to illustrate the OOD generalization of SRDP\ncompared to prior algorithms. In addition, we assess the performance of our\nmodel on D4RL continuous control benchmarks, namely the navigation of an 8-DoF\nant and forward locomotion of half-cheetah, hopper, and walker2d, achieving\nstate-of-the-art results.\n","authors":["Suzan Ece Ada","Erhan Oztop","Emre Ugur"],"pdf_url":"https://arxiv.org/pdf/2307.04726v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.16904v1","updated":"2023-08-31T17:59:00Z","published":"2023-08-31T17:59:00Z","title":"A Note on Randomized Kaczmarz Algorithm for Solving Doubly-Noisy Linear\n Systems","summary":" Large-scale linear systems, $Ax=b$, frequently arise in practice and demand\neffective iterative solvers. Often, these systems are noisy due to operational\nerrors or faulty data-collection processes. In the past decade, the randomized\nKaczmarz (RK) algorithm has been studied extensively as an efficient iterative\nsolver for such systems. However, the convergence study of RK in the noisy\nregime is limited and considers measurement noise in the right-hand side\nvector, $b$. Unfortunately, in practice, that is not always the case; the\ncoefficient matrix $A$ can also be noisy. In this paper, we analyze the\nconvergence of RK for noisy linear systems when the coefficient matrix, $A$, is\ncorrupted with both additive and multiplicative noise, along with the noisy\nvector, $b$. In our analyses, the quantity $\tilde R=\| \tilde A^{\dagger}\n\|_2^2 \|\tilde A \|_F^2$ influences the convergence of RK, where $\tilde A$\nrepresents a noisy version of $A$. We claim that our analysis is robust and\nrealistically applicable, as we do not require information about the noiseless\ncoefficient matrix, $A$, and considering different conditions on noise, we can\ncontrol the convergence of RK. We substantiate our theoretical findings by\nperforming comprehensive numerical experiments.\n","authors":["El Houcine Bergou","Soumia Boucherouite","Aritra Dutta","Xin Li","Anna Ma"],"pdf_url":"https://arxiv.org/pdf/2308.16904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16900v1","updated":"2023-08-31T17:58:28Z","published":"2023-08-31T17:58:28Z","title":"Learning to Taste: A Multimodal Wine Dataset","summary":" We present WineSensed, a large multimodal wine dataset for studying the\nrelations between visual perception, language, and flavor. The dataset\nencompasses 897k images of wine labels and 824k reviews of wines curated from\nthe Vivino platform. It has over 350k unique vintages, annotated with year,\nregion, rating, alcohol percentage, price, and grape composition.
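For readers unfamiliar with the iteration the RK paper analyzes, the standard randomized Kaczmarz update (with rows sampled proportionally to their squared norms) is short enough to state in full. The noisy-system analysis studies exactly this iteration when $A$ and $b$ are corrupted.

```python
# Standard randomized Kaczmarz for Ax = b with squared-row-norm sampling.
import numpy as np

def randomized_kaczmarz(A, b, iters=5000, seed=0):
    rng = np.random.default_rng(seed)
    m, n = A.shape
    row_norms = np.sum(A**2, axis=1)
    probs = row_norms / row_norms.sum()
    x = np.zeros(n)
    for _ in range(iters):
        i = rng.choice(m, p=probs)
        # Project x onto the hyperplane defined by row i.
        x += (b[i] - A[i] @ x) / row_norms[i] * A[i]
    return x

rng = np.random.default_rng(1)
A = rng.standard_normal((200, 20))
x_true = rng.standard_normal(20)
b = A @ x_true + 1e-3 * rng.standard_normal(200)   # noisy right-hand side
print(np.linalg.norm(randomized_kaczmarz(A, b) - x_true))
```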
We obtained\nfine-grained flavor annotations on a subset by conducting a wine-tasting\nexperiment with 256 participants who were asked to rank wines based on their\nsimilarity in flavor, resulting in more than 5k pairwise flavor distances. We\npropose a low-dimensional concept embedding algorithm that combines human\nexperience with automatic machine similarity kernels. We demonstrate that this\nshared concept embedding space improves upon separate embedding spaces for\ncoarse flavor classification (alcohol percentage, country, grape, price,\nrating) and aligns with the intricate human perception of flavor.\n","authors":["Thoranna Bender","Simon Møe Sørensen","Alireza Kashani","K. Eldjarn Hjorleifsson","Grethe Hyldig","Søren Hauberg","Serge Belongie","Frederik Warburg"],"pdf_url":"https://arxiv.org/pdf/2308.16900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16898v1","updated":"2023-08-31T17:57:50Z","published":"2023-08-31T17:57:50Z","title":"Transformers as Support Vector Machines","summary":" Since its inception in \"Attention Is All You Need\", transformer architecture\nhas led to revolutionary advancements in NLP. The attention layer within the\ntransformer admits a sequence of input tokens $X$ and makes them interact\nthrough pairwise similarities computed as softmax$(XQK^\\top X^\\top)$, where\n$(K,Q)$ are the trainable key-query parameters. In this work, we establish a\nformal equivalence between the optimization geometry of self-attention and a\nhard-margin SVM problem that separates optimal input tokens from non-optimal\ntokens using linear constraints on the outer-products of token pairs. This\nformalism allows us to characterize the implicit bias of 1-layer transformers\noptimized with gradient descent: (1) Optimizing the attention layer with\nvanishing regularization, parameterized by $(K,Q)$, converges in direction to\nan SVM solution minimizing the nuclear norm of the combined parameter\n$W=KQ^\\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm\nobjective. We characterize this convergence, highlighting that it can occur\ntoward locally-optimal directions rather than global ones. (2) Complementing\nthis, we prove the local/global directional convergence of gradient descent\nunder suitable geometric conditions. Importantly, we show that\nover-parameterization catalyzes global convergence by ensuring the feasibility\nof the SVM problem and by guaranteeing a benign optimization landscape devoid\nof stationary points. (3) While our theory applies primarily to linear\nprediction heads, we propose a more general SVM equivalence that predicts the\nimplicit bias with nonlinear heads. Our findings are applicable to arbitrary\ndatasets and their validity is verified via experiments. We also introduce\nseveral open problems and research directions. 
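The attention map at the center of the SVM equivalence above is simple to compute explicitly; the sketch below evaluates softmax$(XQK^\top X^\top)$ and the combined parameter $W=KQ^\top$ for random inputs, with shapes chosen for illustration.

```python
# Pairwise token similarities as studied in the paper: softmax(X Q K^T X^T).
import numpy as np

def attention_map(X, K, Q):
    scores = X @ Q @ K.T @ X.T                      # (T, T) pairwise scores
    scores -= scores.max(axis=-1, keepdims=True)    # numerical stability
    exps = np.exp(scores)
    return exps / exps.sum(axis=-1, keepdims=True)  # row-wise softmax

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 8))   # 5 tokens of dimension 8
Q = rng.standard_normal((8, 4))   # trainable key-query parameters
K = rng.standard_normal((8, 4))
W = K @ Q.T                       # combined parameter whose nuclear norm the
                                  # SVM characterization involves
print(attention_map(X, K, Q).sum(axis=-1))  # each row sums to 1
```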
We believe these findings\ninspire the interpretation of transformers as a hierarchy of SVMs that\nseparates and selects optimal tokens.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Christos Thrampoulidis","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2308.16898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16896v1","updated":"2023-08-31T17:57:17Z","published":"2023-08-31T17:57:17Z","title":"PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic\n Occupancy Prediction","summary":" Semantic segmentation in autonomous driving has been undergoing an evolution\nfrom sparse point segmentation to dense voxel segmentation, where the objective\nis to predict the semantic occupancy of each voxel in the concerned 3D space.\nThe dense nature of the prediction space has rendered existing efficient\n2D-projection-based methods (e.g., bird's eye view, range view, etc.)\nineffective, as they can only describe a subspace of the 3D scene. To address\nthis, we propose a cylindrical tri-perspective view to represent point clouds\neffectively and comprehensively and a PointOcc model to process them\nefficiently. Considering the distance distribution of LiDAR point clouds, we\nconstruct the tri-perspective view in the cylindrical coordinate system for\nmore fine-grained modeling of nearer areas. We employ spatial group pooling to\nmaintain structural details during projection and adopt 2D backbones to\nefficiently process each TPV plane. Finally, we obtain the features of each\npoint by aggregating its projected features on each of the processed TPV planes\nwithout the need for any post-processing. Extensive experiments on both 3D\noccupancy prediction and LiDAR segmentation benchmarks demonstrate that the\nproposed PointOcc achieves state-of-the-art performance at a much faster speed.\nSpecifically, despite only using LiDAR, PointOcc significantly outperforms all\nother methods, including multi-modal methods, by a large margin on the\nOpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc.\n","authors":["Sicheng Zuo","Wenzhao Zheng","Yuanhui Huang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16896v1.pdf","comment":"Code is available at https://github.com/wzzheng/PointOcc"},{"id":"http://arxiv.org/abs/2308.16893v1","updated":"2023-08-31T17:56:13Z","published":"2023-08-31T17:56:13Z","title":"Language-Conditioned Path Planning","summary":" Contact is at the core of robotic manipulation. At times, it is desired (e.g.\nmanipulation and grasping), and at times, it is harmful (e.g. when avoiding\nobstacles). However, traditional path planning algorithms focus solely on\ncollision-free paths, limiting their applicability in contact-rich tasks. To\naddress this limitation, we propose the domain of Language-Conditioned Path\nPlanning, where contact-awareness is incorporated into the path planning\nproblem. As a first step in this domain, we propose Language-Conditioned\nCollision Functions (LACO), a novel approach that learns a collision function\nusing only a single-view image, language prompt, and robot configuration. LACO\npredicts collisions between the robot and the environment, enabling flexible,\nconditional path planning without the need for manual object annotations, point\ncloud data, or ground-truth object meshes.
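The cylindrical tri-perspective view can be illustrated by converting points to $(\rho, \phi, z)$ and scattering them onto the three coordinate planes. Grid sizes, normalization, and occupancy counting in the sketch below are illustrative assumptions; the paper additionally uses spatial group pooling and 2D backbones on each plane.

```python
# Sketch of a cylindrical tri-perspective projection of a point cloud.
import numpy as np

def cylindrical_tpv(points, grid=(64, 64, 32)):
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    rho = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    coords = np.stack([rho, phi, z], axis=1)
    lo, hi = coords.min(0), coords.max(0)
    idx = ((coords - lo) / (hi - lo + 1e-9) * (np.array(grid) - 1)).astype(int)
    pairs = [(0, 1), (1, 2), (0, 2)]               # rho-phi, phi-z, rho-z
    planes = [np.zeros((grid[a], grid[b])) for a, b in pairs]
    for (a, b), plane in zip(pairs, planes):
        np.add.at(plane, (idx[:, a], idx[:, b]), 1.0)  # occupancy per cell
    return planes

pts = np.random.default_rng(0).standard_normal((1000, 3))
print([p.shape for p in cylindrical_tpv(pts)])  # [(64, 64), (64, 32), (64, 32)]
```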
In both simulation and the real\nworld, we demonstrate that LACO can facilitate complex, nuanced path plans that\nallow for interaction with objects that are safe to collide with, rather than\nprohibiting any collision.\n","authors":["Amber Xie","Youngwoon Lee","Pieter Abbeel","Stephen James"],"pdf_url":"https://arxiv.org/pdf/2308.16893v1.pdf","comment":"Conference on Robot Learning, 2023"},{"id":"http://arxiv.org/abs/2308.16891v1","updated":"2023-08-31T17:52:10Z","published":"2023-08-31T17:52:10Z","title":"GNFactor: Multi-Task Real Robot Learning with Generalizable Neural\n Feature Fields","summary":" It is a long-standing problem in robotics to develop agents capable of\nexecuting diverse manipulation tasks from visual observations in unstructured\nreal-world environments. To achieve this goal, the robot needs to have a\ncomprehensive understanding of the 3D structure and semantics of the scene. In\nthis work, we present $\textbf{GNFactor}$, a visual behavior cloning agent for\nmulti-task robotic manipulation with $\textbf{G}$eneralizable $\textbf{N}$eural\nfeature $\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural\nfield (GNF) as a reconstruction module and a Perceiver Transformer as a\ndecision-making module, leveraging a shared deep 3D voxel representation. To\nincorporate semantics in 3D, the reconstruction module utilizes a\nvision-language foundation model ($\textit{e.g.}$, Stable Diffusion) to distill\nrich semantic information into the deep 3D voxel. We evaluate GNFactor on 3\nreal robot tasks and perform detailed ablations on 10 RLBench tasks with a\nlimited number of demonstrations. We observe a substantial improvement of\nGNFactor over current state-of-the-art methods in seen and unseen tasks,\ndemonstrating the strong generalization ability of GNFactor. Our project\nwebsite is https://yanjieze.com/GNFactor/ .\n","authors":["Yanjie Ze","Ge Yan","Yueh-Hua Wu","Annabella Macaluso","Yuying Ge","Jianglong Ye","Nicklas Hansen","Li Erran Li","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16891v1.pdf","comment":"CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/"},{"id":"http://arxiv.org/abs/2212.02611v2","updated":"2023-08-31T17:51:08Z","published":"2022-12-05T21:52:12Z","title":"StyleGAN as a Utility-Preserving Face De-identification Method","summary":" Face de-identification methods have been proposed to preserve users' privacy\nby obscuring their faces. These methods, however, can degrade the quality of\nphotos, and they usually do not preserve the utility of faces, i.e., their age,\ngender, pose, and facial expression. Recently, GANs, such as StyleGAN, have\nbeen proposed, which generate realistic, high-quality imaginary faces. In this\npaper, we investigate the use of StyleGAN in generating de-identified faces\nthrough style mixing. We examined this de-identification method for preserving\nutility and privacy by implementing several face detection, verification, and\nidentification attacks and conducting a user study. The results from our\nextensive experiments, human evaluation, and comparison with two\nstate-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN\nperforms on par with or better than these methods, preserving users' privacy and\nimages' utility. In particular, the results of the machine learning-based\nexperiments show that StyleGAN0-4 preserves utility better than CIAGAN and\nDeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves\nutility at the same level while providing more privacy.
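A collision function of the kind LACO describes (image, language prompt, robot configuration in; collision probability out) can be pictured as a small fusion network. The module below is entirely hypothetical: feature extractors, dimensions, and the fusion architecture are assumptions, since the abstract does not specify them.

```python
# Hypothetical LACO-style collision predictor; sizes and fusion are stand-ins.
import torch
import torch.nn as nn

class CollisionFunction(nn.Module):
    def __init__(self, img_dim=512, lang_dim=384, cfg_dim=7, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(img_dim + lang_dim + cfg_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, img_feat, lang_feat, robot_cfg):
        fused = torch.cat([img_feat, lang_feat, robot_cfg], dim=-1)
        return torch.sigmoid(self.net(fused))  # P(collision | scene, prompt, q)

laco = CollisionFunction()
p = laco(torch.randn(1, 512), torch.randn(1, 384), torch.randn(1, 7))
print(p.item())  # a planner might treat p > 0.5 as "in collision"
```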
In this paper, for the\nfirst time, we also performed a carefully designed user study to examine both\nprivacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well\nas CIAGAN and DeepPrivacy from the human observers' perspectives. Our\nstatistical tests showed that participants tend to verify and identify\nStyleGAN0-5 images more easily than DeepPrivacy images. All the methods but\nStyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding\nutility, as expected, StyleGAN0-5 performed significantly better in preserving\nsome attributes. Among all methods, on average, participants believe gender has\nbeen preserved the most while naturalness has been preserved the least.\n","authors":["Seyyed Mohammad Sadegh Moosavi Khorzooghi","Shirin Nilizadeh"],"pdf_url":"https://arxiv.org/pdf/2212.02611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16889v1","updated":"2023-08-31T17:50:54Z","published":"2023-08-31T17:50:54Z","title":"Federated Learning in UAV-Enhanced Networks: Joint Coverage and\n Convergence Time Optimization","summary":" Federated learning (FL) involves several devices that collaboratively train a\nshared model without transferring their local data. FL reduces the\ncommunication overhead, making it a promising learning method in UAV-enhanced\nwireless networks with scarce energy resources. Despite the potential,\nimplementing FL in UAV-enhanced networks is challenging, as conventional UAV\nplacement methods that maximize coverage increase the FL delay significantly.\nMoreover, the uncertainty and lack of a priori information about crucial\nvariables, such as channel quality, exacerbate the problem. In this paper, we\nfirst analyze the statistical characteristics of a UAV-enhanced wireless sensor\nnetwork (WSN) with energy harvesting. We then develop a model and solution\nbased on the multi-objective multi-armed bandit theory to maximize the network\ncoverage while minimizing the FL delay. Besides, we propose another solution\nthat is particularly useful with large action sets and strict energy\nconstraints at the UAVs. Our proposal uses a scalarized best-arm identification\nalgorithm to find the optimal arms that maximize the ratio of the expected\nreward to the expected energy cost by sequentially eliminating one or more arms\nin each round. Then, we derive the upper bound on the error probability of our\nmulti-objective and cost-aware algorithm. Numerical results show the\neffectiveness of our approach.\n","authors":["Mariam Yahya","Setareh Maghsudi","Slawomir Stanczak"],"pdf_url":"https://arxiv.org/pdf/2308.16889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16886v1","updated":"2023-08-31T17:45:34Z","published":"2023-08-31T17:45:34Z","title":"Prediction of Diblock Copolymer Morphology via Machine Learning","summary":" A machine learning approach is presented to accelerate the computation of\nblock polymer morphology evolution for large domains over long timescales. The\nstrategy exploits the separation of characteristic times between coarse-grained\nparticle evolution on the monomer scale and slow morphological evolution over\nmesoscopic scales. In contrast to empirical continuum models, the proposed\napproach learns stochastically driven defect annihilation processes directly\nfrom particle-based simulations. A UNet architecture that respects different\nboundary conditions is adopted, thereby allowing periodic and fixed substrate\nboundary conditions of arbitrary shape. 
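The cost-aware bandit idea above (maximize the ratio of expected reward to expected energy cost, eliminating arms round by round) can be sketched with a simple successive-elimination loop. Round lengths and the elimination rule below are simplified stand-ins for the paper's algorithm.

```python
# Sketch of ratio-based successive elimination over UAV-placement "arms".
import numpy as np

def ratio_best_arm(pull, n_arms, rounds=10, pulls_per_round=50, seed=0):
    rng = np.random.default_rng(seed)
    active = list(range(n_arms))
    reward_sum, cost_sum = np.zeros(n_arms), np.zeros(n_arms)
    for _ in range(rounds):
        if len(active) == 1:
            break
        for a in active:
            for _ in range(pulls_per_round):
                r, c = pull(a, rng)
                reward_sum[a] += r
                cost_sum[a] += c
        ratios = reward_sum[active] / np.maximum(cost_sum[active], 1e-9)
        active.remove(active[int(np.argmin(ratios))])  # drop the worst arm
    return active[0]

def pull(arm, rng):  # toy environment: arm-dependent reward and energy cost
    return rng.normal(1.0 + 0.2 * arm, 0.5), rng.normal(2.0 - 0.1 * arm, 0.2)

print(ratio_best_arm(pull, n_arms=5))  # converges to the best ratio arm
```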
Physical concepts are also introduced\nvia the loss function and symmetries are incorporated via data augmentation.\nThe model is validated using three different use cases. Explainable artificial\nintelligence methods are applied to visualize the morphology evolution over\ntime. This approach enables the generation of large system sizes and long\ntrajectories to investigate defect densities and their evolution under\ndifferent types of confinement. As an application, we demonstrate the\nimportance of accessing late-stage morphologies for understanding particle\ndiffusion inside a single block. This work has implications for directed\nself-assembly and materials design in micro-electronics, battery materials, and\nmembranes.\n","authors":["Hyun Park","Boyuan Yu","Juhae Park","Ge Sun","Emad Tajkhorshid","Juan J. de Pablo","Ludwig Schneider"],"pdf_url":"https://arxiv.org/pdf/2308.16886v1.pdf","comment":"51 page, 11 Figures and 5 figures in the SI"},{"id":"http://arxiv.org/abs/2308.16884v1","updated":"2023-08-31T17:43:08Z","published":"2023-08-31T17:43:08Z","title":"The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122\n Language Variants","summary":" We present Belebele, a multiple-choice machine reading comprehension (MRC)\ndataset spanning 122 language variants. Significantly expanding the language\ncoverage of natural language understanding (NLU) benchmarks, this dataset\nenables the evaluation of text models in high-, medium-, and low-resource\nlanguages. Each question is based on a short passage from the Flores-200\ndataset and has four multiple-choice answers. The questions were carefully\ncurated to discriminate between models with different levels of general\nlanguage comprehension. The English dataset on its own proves difficult enough\nto challenge state-of-the-art language models. Being fully parallel, this\ndataset enables direct comparison of model performance across all languages. We\nuse this dataset to evaluate the capabilities of multilingual masked language\nmodels (MLMs) and large language models (LLMs). We present extensive results\nand find that despite significant cross-lingual transfer in English-centric\nLLMs, much smaller MLMs pretrained on balanced multilingual data still\nunderstand far more languages. We also observe that larger vocabulary size and\nconscious vocabulary construction correlate with better performance on\nlow-resource languages. Overall, Belebele opens up new avenues for evaluating\nand analyzing the multilingual capabilities of NLP systems.\n","authors":["Lucas Bandarkar","Davis Liang","Benjamin Muller","Mikel Artetxe","Satya Narayan Shukla","Donald Husa","Naman Goyal","Abhinandan Krishnan","Luke Zettlemoyer","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2308.16884v1.pdf","comment":"27 pages, 13 figures"},{"id":"http://arxiv.org/abs/2209.03450v2","updated":"2023-08-31T17:38:19Z","published":"2022-09-07T20:11:17Z","title":"Seeking Interpretability and Explainability in Binary Activated Neural\n Networks","summary":" We study the use of binary activated neural networks as interpretable and\nexplainable predictors in the context of regression tasks on tabular data; more\nspecifically, we provide guarantees on their expressiveness, present an\napproach based on the efficient computation of SHAP values for quantifying the\nrelative importance of the features, hidden neurons and even weights. 
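The statement that symmetries are incorporated via data augmentation admits a one-function illustration: applying random symmetries of the square to 2D morphology fields before training. The sketch below is purely illustrative and assumes periodic fields for which rotations and flips are valid symmetries.

```python
# Minimal symmetry augmentation: random 90-degree rotations and flips of a
# 2D coarse-grained density field (the dihedral symmetries of the square).
import numpy as np

def d4_augment(field, rng):
    out = np.rot90(field, rng.integers(4))   # random quarter-turn rotation
    if rng.integers(2):
        out = out[:, ::-1]                   # random horizontal flip
    return out

rng = np.random.default_rng(0)
field = rng.standard_normal((64, 64))        # stand-in morphology field
batch = np.stack([d4_augment(field, rng) for _ in range(8)])
print(batch.shape)  # (8, 64, 64)
```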
As the\nmodel's simplicity is instrumental in achieving interpretability, we propose a\ngreedy algorithm for building compact binary activated networks. This approach\ndoes not need to fix an architecture for the network in advance: it is built one\nlayer at a time, one neuron at a time, leading to predictors that are not\nneedlessly complex for a given task.\n","authors":["Benjamin Leblanc","Pascal Germain"],"pdf_url":"https://arxiv.org/pdf/2209.03450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.02373v2","updated":"2023-08-31T17:12:16Z","published":"2022-10-05T16:30:35Z","title":"Dynamical systems' based neural networks","summary":" Neural networks have gained much interest because of their effectiveness in\nmany applications. However, their mathematical properties are generally not\nwell understood. If there is some underlying geometric structure inherent to\nthe data or to the function to approximate, it is often desirable to take this\ninto account in the design of the neural network. In this work, we start with a\nnon-autonomous ODE and build neural networks using a suitable,\nstructure-preserving, numerical time-discretisation. The structure of the\nneural network is then inferred from the properties of the ODE vector field.\nBesides injecting more structure into the network architectures, this modelling\nprocedure allows a better theoretical understanding of their behaviour. We\npresent two universal approximation results and demonstrate how to impose some\nparticular properties on the neural networks. A particular focus is on\n1-Lipschitz architectures including layers that are not 1-Lipschitz. These\nnetworks are expressive and robust against adversarial attacks, as shown for\nthe CIFAR-10 and CIFAR-100 datasets.\n","authors":["Elena Celledoni","Davide Murari","Brynjulf Owren","Carola-Bibiane Schönlieb","Ferdia Sherry"],"pdf_url":"https://arxiv.org/pdf/2210.02373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16859v1","updated":"2023-08-31T17:03:34Z","published":"2023-08-31T17:03:34Z","title":"Information Theoretically Optimal Sample Complexity of Learning\n Dynamical Directed Acyclic Graphs","summary":" In this article, the optimal sample complexity of learning the underlying\ninteraction/dependencies of a Linear Dynamical System (LDS) over a Directed\nAcyclic Graph (DAG) is studied. The sample complexity of learning a DAG's\nstructure is well-studied for static systems, where the samples of nodal states\nare independent and identically distributed (i.i.d.). However, such a study is\nless explored for DAGs with dynamical systems, where the nodal states are\ntemporally correlated. We call such a DAG underlying an LDS a \emph{dynamical}\nDAG (DDAG). In particular, we consider a DDAG where the nodal dynamics are\ndriven by unobserved exogenous noise sources that are wide-sense stationary\n(WSS) in time but are mutually uncorrelated, and have the same {power spectral\ndensity (PSD)}. Inspired by the static settings, a metric and an algorithm\nbased on the PSD matrix of the observed time series are proposed to reconstruct\nthe DDAG. The equal noise PSD assumption can be relaxed such that\nidentifiability conditions for DDAG reconstruction are not violated. For the\nLDS with WSS (sub) Gaussian exogenous noise sources, it is shown that the\noptimal sample complexity (or length of state trajectory) needed to learn the\nDDAG is $n=\Theta(q\log(p/q))$, where $p$ is the number of nodes and $q$ is the\nmaximum number of parents per node.
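The "one neuron at a time" construction for binary activated networks can be caricatured as greedy forward selection over random sign-activated candidates. The sketch below is a toy stand-in under that assumption; the paper's actual construction is more principled, and the gain criterion here ignores correlations among already-selected neurons.

```python
# Toy greedy growth of a one-hidden-layer binary activated regressor.
import numpy as np

def grow_binary_net(X, y, n_candidates=200, max_neurons=20, seed=0):
    rng = np.random.default_rng(seed)
    feats = [np.ones(len(X))]                     # bias "neuron"
    for _ in range(max_neurons):
        H = np.stack(feats, axis=1)
        resid = y - H @ np.linalg.lstsq(H, y, rcond=None)[0]
        best, best_gain = None, 1e-8
        for _ in range(n_candidates):
            w = rng.standard_normal(X.shape[1])
            h = np.sign(X @ w + rng.standard_normal())   # binary activation
            gain = (h @ resid) ** 2 / max(h @ h, 1e-12)  # SSE reduction proxy
            if gain > best_gain:
                best, best_gain = h, gain
        if best is None:                          # no candidate helps: stop
            break
        feats.append(best)
    H = np.stack(feats, axis=1)
    return H @ np.linalg.lstsq(H, y, rcond=None)[0]  # fitted values
```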
To prove the sample complexity upper bound,\na concentration bound for the PSD estimation is derived, under two different\nsampling strategies. A matching min-max lower bound using generalized Fano's\ninequality is also provided, thus showing the order optimality of the proposed\nalgorithm.\n","authors":["Mishfad Shaikh Veedu","Deepjyoti Deka","Murti V. Salapaka"],"pdf_url":"https://arxiv.org/pdf/2308.16859v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2308.16858v1","updated":"2023-08-31T17:03:16Z","published":"2023-08-31T17:03:16Z","title":"Majorization-Minimization for sparse SVMs","summary":" Several decades ago, Support Vector Machines (SVMs) were introduced for\nperforming binary classification tasks under a supervised framework. Nowadays,\nthey often outperform other supervised methods and remain one of the most\npopular approaches in the machine learning arena. In this work, we investigate\nthe training of SVMs through a smooth sparse-promoting-regularized squared\nhinge loss minimization. This choice paves the way to the application of quick\ntraining methods built on majorization-minimization approaches, benefiting from\nthe Lipschitz differentiability of the loss function. Moreover, the proposed\napproach allows us to handle sparsity-preserving regularizers promoting the\nselection of the most significant features, thus enhancing performance.\nNumerical tests and comparisons conducted on three different datasets\ndemonstrate the good performance of the proposed methodology in terms of\nqualitative metrics (accuracy, precision, recall, and F1 score) as well as\ncomputational cost.\n","authors":["Alessandro Benfenati","Emilie Chouzenoux","Giorgia Franchini","Salla Latva-Aijo","Dominik Narnhofer","Jean-Christophe Pesquet","Sebastian J. Scott","Mahsa Yousefi"],"pdf_url":"https://arxiv.org/pdf/2308.16858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14172v2","updated":"2023-08-31T16:57:35Z","published":"2023-08-27T18:28:58Z","title":"Hypergraph Structure Inference From Data Under Smoothness Prior","summary":" Hypergraphs are important for processing data with higher-order relationships\ninvolving more than two entities. In scenarios where explicit hypergraphs are\nnot readily available, it is desirable to infer a meaningful hypergraph\nstructure from the node features to capture the intrinsic relations within the\ndata. However, existing methods either adopt simple pre-defined rules that fail\nto precisely capture the distribution of the potential hypergraph structure, or\nlearn a mapping between hypergraph structures and node features but require a\nlarge amount of labelled data, i.e., pre-existing hypergraph structures, for\ntraining. Both restrict their applications in practical scenarios. To fill this\ngap, we propose a novel smoothness prior that enables us to design a method to\ninfer the probability for each potential hyperedge without labelled data as\nsupervision. The proposed prior indicates features of nodes in a hyperedge are\nhighly correlated by the features of the hyperedge containing them. We use this\nprior to derive the relation between the hypergraph structure and the node\nfeatures via probabilistic modelling. This allows us to develop an unsupervised\ninference method to estimate the probability for each potential hyperedge via\nsolving an optimisation problem that has an analytical solution.
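Because the squared hinge loss has a Lipschitz gradient, it admits a quadratic majorizer, and minimizing that majorizer plus an l1 penalty yields a soft-thresholding step. The sketch below is one MM-flavoured (proximal-gradient) instantiation under that reading; the paper's exact MM scheme and regularizers may differ.

```python
# Proximal-gradient sketch for l1-regularized squared hinge loss.
import numpy as np

def soft_threshold(v, t):
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def sparse_svm(X, y, lam=0.1, iters=500):
    L = 2.0 * np.linalg.norm(X, 2) ** 2       # Lipschitz const. of the gradient
    w = np.zeros(X.shape[1])
    for _ in range(iters):
        margins = np.maximum(0.0, 1.0 - y * (X @ w))
        grad = -2.0 * X.T @ (margins * y)     # gradient of sum of max(0, m)^2
        w = soft_threshold(w - grad / L, lam / L)
    return w

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 30))
w_true = np.zeros(30); w_true[:3] = [2.0, -1.5, 1.0]   # sparse ground truth
y = np.sign(X @ w_true)
w = sparse_svm(X, y)
print(np.nonzero(np.round(w, 2))[0])   # mostly the first three features
```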
Experiments on\nboth synthetic and real-world data demonstrate that our method can learn\nmeaningful hypergraph structures from data more efficiently than existing\nhypergraph structure inference methods.\n","authors":["Bohan Tang","Siheng Chen","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2308.14172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15034v2","updated":"2023-08-31T16:37:28Z","published":"2023-07-27T17:42:06Z","title":"Speeding up Fourier Neural Operators via Mixed Precision","summary":" The Fourier neural operator (FNO) is a powerful technique for learning\nsurrogate maps for partial differential equation (PDE) solution operators. For\nmany real-world applications, which often require high-resolution data points,\ntraining time and memory usage are significant bottlenecks. While there are\nmixed-precision training techniques for standard neural networks, those work\nfor real-valued datatypes on finite dimensions and therefore cannot be directly\napplied to FNO, which crucially operates in the (complex-valued) Fourier domain\nand in function spaces. On the other hand, since the Fourier transform is\nalready an approximation (due to discretization error), we do not need to\nperform the operation at full precision. In this work, we (i) profile memory\nand runtime for FNO with full and mixed-precision training, (ii) conduct a\nstudy on the numerical stability of mixed-precision training of FNO, and (iii)\ndevise a training routine which substantially decreases training time and\nmemory usage (up to 34%), with little or no reduction in accuracy, on the\nNavier-Stokes and Darcy flow equations. Combined with the recently proposed\ntensorized FNO (Kossaifi et al., 2023), the resulting model has far better\nperformance while also being significantly faster than the original FNO.\n","authors":["Colin White","Renbo Tu","Jean Kossaifi","Gennady Pekhimenko","Kamyar Azizzadenesheli","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2307.15034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00752v3","updated":"2023-08-31T16:28:50Z","published":"2023-01-02T16:51:40Z","title":"Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave\n Communications","summary":" This study demonstrates the feasibility of point cloud-based proactive link\nquality prediction for millimeter-wave (mmWave) communications. Previous\nstudies have proposed machine learning-based methods to predict received signal\nstrength for future time periods using time series of depth images to mitigate\nthe line-of-sight (LOS) path blockage by pedestrians in mmWave communication.\nHowever, these image-based methods have limited applicability due to privacy\nconcerns as camera images may contain sensitive information. This study\nproposes a point cloud-based method for mmWave link quality prediction and\ndemonstrates its feasibility through experiments. Point clouds represent\nthree-dimensional (3D) spaces as a set of points and are sparser and less\nlikely to contain sensitive information than camera images. Additionally, point\nclouds provide 3D position and motion information, which is necessary for\nunderstanding the radio propagation environment involving pedestrians. 
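The mixed-precision baseline that the FNO study profiles can be pictured with a generic PyTorch autocast skeleton; note that this standard recipe covers real-valued ops only, and the paper's contribution (handling the complex-valued Fourier-domain operations) is not shown here.

```python
# Generic mixed-precision training step with torch autocast + loss scaling.
import torch

def train_step(model, batch, target, optimizer, scaler, loss_fn):
    optimizer.zero_grad(set_to_none=True)
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        out = model(batch)              # matmuls/convs run in half precision
        loss = loss_fn(out, target)
    scaler.scale(loss).backward()       # scaled to avoid fp16 underflow
    scaler.step(optimizer)
    scaler.update()
    return loss.item()

# scaler = torch.cuda.amp.GradScaler()  # created once, outside the loop;
# requires a CUDA device.
```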
This\nstudy designs the mmWave link quality prediction method and conducts realistic\nindoor experiments, where the link quality fluctuates significantly due to\nhuman blockage, using commercially available IEEE 802.11ad-based 60 GHz\nwireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light\ndetection and ranging (LiDAR) for point cloud acquisition. The experimental\nresults showed that our proposed method can predict future large attenuation of\nmmWave received signal strength and throughput induced by the LOS path blockage\nby pedestrians with comparable or superior accuracy to image-based prediction\nmethods. Hence, our point cloud-based method can serve as a viable alternative\nto image-based methods.\n","authors":["Shoki Ohta","Takayuki Nishio","Riichi Kudo","Kahoko Takahashi","Hisashi Nagata"],"pdf_url":"https://arxiv.org/pdf/2301.00752v3.pdf","comment":"Submitted to IEEE Transactions on Machine Learning in Communications\n and Networking"},{"id":"http://arxiv.org/abs/2308.16848v1","updated":"2023-08-31T16:27:08Z","published":"2023-08-31T16:27:08Z","title":"Natural Quantum Monte Carlo Computation of Excited States","summary":" We present a variational Monte Carlo algorithm for estimating the lowest\nexcited states of a quantum system which is a natural generalization of the\nestimation of ground states. The method has no free parameters and requires no\nexplicit orthogonalization of the different states, instead transforming the\nproblem of finding excited states of a given system into that of finding the\nground state of an expanded system. Expected values of arbitrary observables\ncan be calculated, including off-diagonal expectations between different states\nsuch as the transition dipole moment. Although the method is entirely general,\nit works particularly well in conjunction with recent work on using neural\nnetworks as variational Ansatze for many-electron systems, and we show that by\ncombining this method with the FermiNet and Psiformer Ansatze we can accurately\nrecover vertical excitation energies and oscillator strengths on molecules as\nlarge as benzene. Beyond the examples on molecules presented here, we expect\nthis technique will be of great interest for applications of variational\nquantum Monte Carlo to atomic, nuclear and condensed matter physics.\n","authors":["David Pfau","Simon Axelrod","Halvard Sutterud","Ingrid von Glehn","James S. Spencer"],"pdf_url":"https://arxiv.org/pdf/2308.16848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16847v1","updated":"2023-08-31T16:26:17Z","published":"2023-08-31T16:26:17Z","title":"Diffusion Models for Interferometric Satellite Aperture Radar","summary":" Probabilistic Diffusion Models (PDMs) have recently emerged as a very\npromising class of generative models, achieving high performance in natural\nimage generation. However, their performance relative to non-natural images,\nlike radar-based satellite data, remains largely unknown. Generating large\namounts of synthetic (and especially labelled) satellite data is crucial to\nimplement deep-learning approaches for the processing and analysis of\n(interferometric) satellite aperture radar data. Here, we leverage PDMs to\ngenerate several radar-based satellite image datasets. We show that PDMs\nsucceed in generating images with complex and realistic structures, but that\nsampling time remains an issue. Indeed, accelerated sampling strategies, which\nwork well on simple image datasets like MNIST, fail on our radar datasets. 
We\nprovide a simple and versatile open-source codebase,\nhttps://github.com/thomaskerdreux/PDM_SAR_InSAR_generation, to train, sample and\nevaluate PDMs using any dataset on a single GPU.\n","authors":["Alexandre Tuel","Thomas Kerdreux","Claudia Hulbert","Bertrand Rouet-Leduc"],"pdf_url":"https://arxiv.org/pdf/2308.16847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08761v3","updated":"2023-08-31T16:21:10Z","published":"2023-02-17T08:56:07Z","title":"Metropolitan Segment Traffic Speeds from Massive Floating Car Data in 10\n Cities","summary":" Traffic analysis is crucial for urban operations and planning, while the\navailability of dense urban traffic data beyond loop detectors is still scarce.\nWe present a large-scale floating vehicle dataset of per-street segment traffic\ninformation, Metropolitan Segment Traffic Speeds from Massive Floating Car Data\nin 10 Cities (MeTS-10), available for 10 global cities with a 15-minute\nresolution for collection periods ranging between 108 and 361 days in 2019-2021\nand covering more than 1500 square kilometers per metropolitan area. MeTS-10\nfeatures traffic speed information at all street levels from main arterials to\nlocal streets for Antwerp, Bangkok, Barcelona, Berlin, Chicago, Istanbul,\nLondon, Madrid, Melbourne and Moscow. The dataset leverages the\nindustrial-scale floating vehicle Traffic4cast data with speeds and vehicle\ncounts provided in a privacy-preserving spatio-temporal aggregation. We detail\nthe efficient matching approach mapping the data to the OpenStreetMap road\ngraph. We evaluate the dataset by comparing it with publicly available\nstationary vehicle detector data (for Berlin, London, and Madrid) and the Uber\ntraffic speed dataset (for Barcelona, Berlin, and London). The comparison\nhighlights the differences across datasets in spatio-temporal coverage and\nvariations in the reported traffic caused by the binning method. MeTS-10\nenables novel, city-wide analysis of mobility and traffic patterns for ten\nmajor world cities, overcoming current limitations of spatially sparse vehicle\ndetector data. The large spatial and temporal coverage offers an opportunity\nfor joining the MeTS-10 with other datasets, such as traffic surveys in traffic\nplanning studies or vehicle detector data in traffic control settings.\n","authors":["Moritz Neun","Christian Eichenberger","Yanan Xin","Cheng Fu","Nina Wiedemann","Henry Martin","Martin Tomko","Lukas Ambühl","Luca Hermes","Michael Kopp"],"pdf_url":"https://arxiv.org/pdf/2302.08761v3.pdf","comment":"Accepted by IEEE Transactions on Intelligent Transportation Systems\n (T-ITS), DOI: https://doi.org/10.1109/TITS.2023.3291737"},{"id":"http://arxiv.org/abs/2306.08149v3","updated":"2023-08-31T16:14:05Z","published":"2023-06-13T21:47:30Z","title":"Neural Mixed Effects for Nonlinear Personalized Predictions","summary":" Personalized prediction is a machine learning approach that predicts a\nperson's future observations based on their past labeled observations and is\ntypically used for sequential tasks, e.g., to predict daily mood ratings. When\nmaking personalized predictions, a model can combine two types of trends: (a)\ntrends shared across people, i.e., person-generic trends, such as being happier\non weekends, and (b) unique trends for each person, i.e., person-specific\ntrends, such as a stressful weekly meeting. Mixed effect models are popular\nstatistical models to study both trends by combining person-generic and\nperson-specific parameters.
Though linear mixed effect models are gaining\npopularity in machine learning through integration with neural networks, these\nintegrations are currently limited to linear person-specific parameters, ruling\nout nonlinear person-specific trends. In this paper, we propose Neural Mixed\nEffect (NME) models to optimize nonlinear person-specific parameters anywhere\nin a neural network in a scalable manner. NME combines the efficiency of neural\nnetwork optimization with nonlinear mixed effects modeling. Empirically, we\nobserve that NME improves performance across six unimodal and multimodal\ndatasets, including a smartphone dataset to predict daily mood and a\nmother-adolescent dataset to predict affective state sequences where half the\nmothers experience at least moderate symptoms of depression. Furthermore, we\nevaluate NME for two model architectures, including for neural conditional\nrandom fields (CRF) to predict affective state sequences where the CRF learns\nnonlinear person-specific temporal transitions between affective states.\nAnalysis of these person-specific transitions on the mother-adolescent dataset\nshows interpretable trends related to the mother's depression symptoms.\n","authors":["Torsten Wörtwein","Nicholas Allen","Lisa B. Sheeber","Randy P. Auerbach","Jeffrey F. Cohn","Louis-Philippe Morency"],"pdf_url":"https://arxiv.org/pdf/2306.08149v3.pdf","comment":"camera-ready version"},{"id":"http://arxiv.org/abs/2308.16835v1","updated":"2023-08-31T16:10:22Z","published":"2023-08-31T16:10:22Z","title":"FedDD: Toward Communication-efficient Federated Learning with\n Differential Parameter Dropout","summary":" Federated Learning (FL) requires frequent exchange of model parameters, which\nleads to long communication delays, especially when the network environments of\nclients vary greatly. Moreover, the parameter server needs to wait for the\nslowest client (i.e., straggler, which may have the largest model size, lowest\ncomputing capability or worst network condition) to upload parameters, which\nmay significantly degrade the communication efficiency. Commonly-used client\nselection methods such as partial client selection would lead to the waste of\ncomputing resources and weaken the generalization of the global model. To\ntackle this problem, along a different line, in this paper, we advocate the\napproach of model parameter dropout instead of client selection, and\naccordingly propose a novel framework of Federated learning scheme with\nDifferential parameter Dropout (FedDD). FedDD consists of two key modules:\ndropout rate allocation and uploaded parameter selection, which will optimize\nthe model parameter uploading ratios tailored to different clients'\nheterogeneous conditions and also select the proper set of important model\nparameters for uploading subject to clients' dropout rate constraints.\nSpecifically, the dropout rate allocation is formulated as a convex\noptimization problem, taking system heterogeneity, data heterogeneity, and\nmodel heterogeneity among clients into consideration. The uploaded parameter\nselection strategy prioritizes important parameters for uploading\nto speed up convergence. Furthermore, we theoretically analyze the convergence\nof the proposed FedDD scheme.
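The core NME idea, person-specific parameters living anywhere in the network, can be sketched as a layer whose weights are shared weights plus a learned per-person offset. The module below is an illustrative assumption, not the paper's full model.

```python
# Minimal neural-mixed-effects layer: shared weights + per-person deltas,
# jointly optimized. Purely illustrative.
import torch
import torch.nn as nn

class MixedEffectsLinear(nn.Module):
    def __init__(self, n_people, d_in, d_out):
        super().__init__()
        self.shared = nn.Linear(d_in, d_out)                    # person-generic
        self.person_delta = nn.Embedding(n_people, d_in * d_out)
        nn.init.zeros_(self.person_delta.weight)                # start at shared
        self.d_in, self.d_out = d_in, d_out

    def forward(self, x, person_id):
        delta = self.person_delta(person_id).view(-1, self.d_out, self.d_in)
        w = self.shared.weight.unsqueeze(0) + delta             # per-person W
        return torch.einsum("bij,bj->bi", w, x) + self.shared.bias

layer = MixedEffectsLinear(n_people=10, d_in=8, d_out=4)
out = layer(torch.randn(3, 8), torch.tensor([0, 5, 9]))
print(out.shape)  # (3, 4)
```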
Extensive performance evaluations demonstrate\nthat the proposed FedDD scheme achieves outstanding performance in both\ncommunication efficiency and model convergence, and also possesses a strong\ngeneralization capability to data of rare classes.\n","authors":["Zhiying Feng","Xu Chen","Qiong Wu","Wen Wu","Xiaoxi Zhang","Qianyi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.16835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11737v2","updated":"2023-08-31T15:57:37Z","published":"2023-06-14T18:27:39Z","title":"Neural ShDF: Reviving an Efficient and Consistent Mesh Segmentation\n Method","summary":" Partitioning a polygonal mesh into meaningful parts can be challenging. Many\napplications require decomposing such structures for further processing in\ncomputer graphics. In the last decade, several methods were proposed to tackle\nthis problem, at the cost of intensive computation times. Recently, machine\nlearning has proven to be effective for the segmentation task on 3D structures.\nNevertheless, these state-of-the-art methods are often hardly generalizable and\nrequire dividing the learned model into several specific classes of objects to\navoid overfitting. We present a data-driven approach leveraging deep learning\nto encode a mapping function prior to mesh segmentation for multiple\napplications. Our network reproduces a neighborhood map using our knowledge of\nthe \textsl{Shape Diameter Function} (SDF) method, based on similarities among\nvertex neighborhoods. Our approach is resolution-agnostic as we downsample the\ninput meshes and query the full-resolution structure solely for neighborhood\ncontributions. Using our predicted SDF values, we can inject the resulting\nstructure into a graph-cut algorithm to generate an efficient and robust mesh\nsegmentation while considerably reducing the required computation times.\n","authors":["Bruno Roy"],"pdf_url":"https://arxiv.org/pdf/2306.11737v2.pdf","comment":"9 pages, 13 figures, and 3 tables. Short paper and poster published\n and presented at SIGGRAPH 2023"},{"id":"http://arxiv.org/abs/2308.16822v1","updated":"2023-08-31T15:52:35Z","published":"2023-08-31T15:52:35Z","title":"Latent Variable Multi-output Gaussian Processes for Hierarchical\n Datasets","summary":" Multi-output Gaussian processes (MOGPs) have been introduced to deal with\nmultiple tasks by exploiting the correlations between different outputs.\nGenerally, MOGP models assume a flat correlation structure between the\noutputs. However, such a formulation does not account for more elaborate\nrelationships, for instance, if several replicates were observed for each\noutput (which is a typical setting in biological experiments). This paper\nproposes an extension of MOGPs for hierarchical datasets (i.e. datasets for\nwhich the relationships between observations can be represented within a tree\nstructure). Our model defines a tailored kernel function accounting for\nhierarchical structures in the data to capture different levels of correlations\nwhile leveraging the introduction of latent variables to express the underlying\ndependencies between outputs through a dedicated kernel.
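FedDD's uploaded-parameter selection admits a compact client-side illustration: upload only the fraction of largest-magnitude parameter updates permitted by the assigned dropout rate. The convex rate-allocation step is not shown, and the top-k-by-magnitude criterion below is an assumption about what "important" means.

```python
# Client-side sketch of differential parameter dropout: sparsify the update
# to the (1 - dropout_rate) largest-magnitude entries before upload.
import torch

def select_upload(update: torch.Tensor, dropout_rate: float):
    k = max(1, int(round((1.0 - dropout_rate) * update.numel())))
    values, idx = torch.topk(update.abs().flatten(), k)
    return idx, update.flatten()[idx]       # sparse payload: indices + values

update = torch.randn(1000)
idx, vals = select_upload(update, dropout_rate=0.9)  # upload only 10%
print(idx.numel())  # 100
```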
This latter feature is\nexpected to significantly improve scalability as the number of tasks increases.\nAn extensive experimental study involving both synthetic and real-world data\nfrom genomics and motion capture is proposed to support our claims.\n","authors":["Chunchao Ma","Arthur Leroy","Mauricio Alvarez"],"pdf_url":"https://arxiv.org/pdf/2308.16822v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.16818v1","updated":"2023-08-31T15:49:21Z","published":"2023-08-31T15:49:21Z","title":"Irregular Traffic Time Series Forecasting Based on Asynchronous\n Spatio-Temporal Graph Convolutional Network","summary":" Accurate traffic forecasting at intersections governed by intelligent traffic\nsignals is critical for the advancement of an effective intelligent traffic\nsignal control system. However, due to the irregular traffic time series\nproduced by intelligent intersections, the traffic forecasting task becomes\nmuch more intractable and imposes three major new challenges: 1) asynchronous\nspatial dependency, 2) irregular temporal dependency among traffic data, and 3)\nvariable-length sequence to be predicted, which severely impede the performance\nof current traffic forecasting methods. To this end, we propose an Asynchronous\nSpatio-tEmporal graph convolutional nEtwoRk (ASeer) to predict the traffic\nstates of the lanes entering intelligent intersections in a future time window.\nSpecifically, by linking lanes via a traffic diffusion graph, we first propose\nan Asynchronous Graph Diffusion Network to model the asynchronous spatial\ndependency between the time-misaligned traffic state measurements of lanes.\nAfter that, to capture the temporal dependency within irregular traffic state\nsequence, a learnable personalized time encoding is devised to embed the\ncontinuous time for each lane. Then we propose a Transformable Time-aware\nConvolution Network that learns meta-filters to derive time-aware convolution\nfilters with transformable filter sizes for efficient temporal convolution on\nthe irregular sequence. Furthermore, a Semi-Autoregressive Prediction Network\nconsisting of a state evolution unit and a semiautoregressive predictor is\ndesigned to effectively and efficiently predict variable-length traffic state\nsequences. Extensive experiments on two real-world datasets demonstrate the\neffectiveness of ASeer in six metrics.\n","authors":["Weijia Zhang","Le Zhang","Jindong Han","Hao Liu","Jingbo Zhou","Yu Mei","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.16818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09312v2","updated":"2023-08-31T15:32:01Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks, such as Reddit discussions. In contrast to traditional comment-only\nmethods, our approach to labelling a comment as hate speech involves a holistic\nanalysis of text and images grounded in the discussion context. This is done by\nleveraging graph transformers to capture the contextual relationships in the\nentire discussion surrounding a comment and grounding the interwoven fusion\nlayers that combine individual comments' text and image embeddings instead of\nprocessing modalities separately. 
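The hierarchical MOGP abstract above ties output correlations to a tree over the outputs. A toy sketch of that idea follows, under the illustrative assumption (not the paper's exact construction) that the cross-output correlation depends only on the depth of the lowest common ancestor of two leaf outputs, multiplied by an RBF kernel on inputs:

```python
import numpy as np

def rbf(x, y, ls=1.0):
    return np.exp(-0.5 * (x - y) ** 2 / ls ** 2)

def lca_level(i, j, depth=2):
    """Tree level of the lowest common ancestor of leaves i and j."""
    d = 0
    while i != j:
        i //= 2
        j //= 2
        d += 1
    return depth - d   # 2 = same leaf, 1 = siblings, 0 = share only the root

def hier_kernel(i, x, j, y, level_var=(0.2, 0.5, 1.0)):
    return level_var[lca_level(i, j)] * rbf(x, y)

# 4 outputs (e.g., replicates) arranged as leaves of a two-level binary tree
K = np.array([[hier_kernel(i, 0.0, j, 0.0) for j in range(4)] for i in range(4)])
print(K)   # the block structure mirrors the output hierarchy
```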
We compare the performance of our model to\nbaselines that only process individual comments and conduct extensive ablation\nstudies. To evaluate our work, we present a new dataset, HatefulDiscussions,\ncomprising complete multi-modal discussions from multiple online communities on\nReddit. We conclude with future work for multimodal solutions to deliver social\nvalue in online contexts, arguing that capturing a holistic view of a\nconversation significantly advances the effort to detect anti-social behaviour.\n","authors":["Liam Hebert","Gaurav Sahu","Yuxuan Guo","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2308.16800v1","updated":"2023-08-31T15:22:31Z","published":"2023-08-31T15:22:31Z","title":"Rank Collapse Causes Over-Smoothing and Over-Correlation in Graph Neural\n Networks","summary":" Our study reveals new theoretical insights into over-smoothing and feature\nover-correlation in deep graph neural networks. We show the prevalence of\ninvariant subspaces, demonstrating a fixed relative behavior that is unaffected\nby feature transformations. Our work clarifies recent observations related to\nconvergence to a constant state and a potential over-separation of node states,\nas the amplification of subspaces only depends on the spectrum of the\naggregation function. In linear scenarios, this leads to node representations\nbeing dominated by a low-dimensional subspace with an asymptotic convergence\nrate independent of the feature transformations. This causes a rank collapse of\nthe node representations, resulting in over-smoothing when smooth vectors span\nthis subspace, and over-correlation even when over-smoothing is avoided. Guided\nby our theory, we propose a sum of Kronecker products as a beneficial property\nthat can provably prevent over-smoothing, over-correlation, and rank collapse.\nWe empirically extend our insights to the non-linear case, demonstrating the\ninability of existing models to capture linearly independent features.\n","authors":["Andreas Roth","Thomas Liebig"],"pdf_url":"https://arxiv.org/pdf/2308.16800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14388v2","updated":"2023-08-31T15:13:28Z","published":"2023-08-28T08:07:57Z","title":"Biclustering Methods via Sparse Penalty","summary":" In this paper, we first reviewed several biclustering methods that are used\nto identify the most significant clusters in gene expression data. Here we\nmainly focused on the SSVD (sparse SVD) method and tried a new sparse penalty\nnamed \"Prenet penalty\", which has previously been used only in factor analysis\nto induce sparsity. Then, in the simulation study, we tried different types of\ngenerated datasets (with different sparsity and dimension), first with a\n1-layer approximation and then with k layers, which shows that the mixed Prenet\npenalty is very effective for non-overlapped data. 
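For readers unfamiliar with the sparse SVD (SSVD) family the abstract above builds on, here is a compact sketch of the rank-1 case: alternating power iterations with soft-thresholding on the singular vectors. Plain L1 soft-thresholding stands in for the Prenet penalty, so this is a simplified stand-in rather than the paper's method:

```python
import numpy as np

def soft(v, lam):
    return np.sign(v) * np.maximum(np.abs(v) - lam, 0.0)

def sparse_svd_rank1(X, lam=0.1, iters=50):
    u = np.linalg.svd(X)[0][:, 0]                 # warm start from plain SVD
    for _ in range(iters):
        v = soft(X.T @ u, lam); v /= np.linalg.norm(v) + 1e-12
        u = soft(X @ v, lam);   u /= np.linalg.norm(u) + 1e-12
    return u, v                                   # non-zeros suggest the bicluster

rng = np.random.default_rng(4)
X = np.zeros((30, 20)); X[:10, :5] = 2.0          # planted 10 x 5 bicluster
X += 0.1 * rng.normal(size=X.shape)
u, v = sparse_svd_rank1(X)
print(np.flatnonzero(u), np.flatnonzero(v))       # ideally rows 0..9, cols 0..4
```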
Finally, we used some real gene expression\ndata to show the behavior of our methods.\n","authors":["Jiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14388v2.pdf","comment":"This research is still in progress and needs to fix some issues"},{"id":"http://arxiv.org/abs/2106.14052v2","updated":"2023-08-31T15:10:45Z","published":"2021-06-26T16:05:44Z","title":"Combining Inductive and Deductive Reasoning for Query Answering over\n Incomplete Knowledge Graphs","summary":" Current methods for embedding-based query answering over incomplete Knowledge\nGraphs (KGs) only focus on inductive reasoning, i.e., predicting answers by\nlearning patterns from the data, and lack the complementary ability to do\ndeductive reasoning, which requires the application of domain knowledge to\ninfer further information. To address this shortcoming, we investigate the\nproblem of incorporating ontologies into embedding-based query answering models\nby defining the task of embedding-based ontology-mediated query answering. We\npropose various integration strategies into prominent representatives of\nembedding models that involve (1) different ontology-driven data augmentation\ntechniques and (2) adaptation of the loss function to enforce the ontology\naxioms. We design novel benchmarks for the considered task based on the LUBM\nand the NELL KGs and evaluate our methods on them. The achieved improvements in\nthe setting that requires both inductive and deductive reasoning are from 20%\nto 55% in HITS@3.\n","authors":["Medina Andresel","Trung-Kien Tran","Csaba Domokos","Pasquale Minervini","Daria Stepanova"],"pdf_url":"https://arxiv.org/pdf/2106.14052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16789v1","updated":"2023-08-31T15:04:28Z","published":"2023-08-31T15:04:28Z","title":"Joint Semantic-Native Communication and Inference via Minimal Simplicial\n Structures","summary":" In this work, we study the problem of semantic communication and inference,\nin which a student agent (i.e. mobile device) queries a teacher agent (i.e.\ncloud server) to generate higher-order data semantics living in a simplicial\ncomplex. Specifically, the teacher first maps its data into a k-order\nsimplicial complex and learns its high-order correlations. For effective\ncommunication and inference, the teacher seeks minimally sufficient and\ninvariant semantic structures prior to conveying information. These minimal\nsimplicial structures are found via judiciously removing simplices selected by\nthe Hodge Laplacians without compromising the inference query accuracy.\nSubsequently, the student locally runs its own set of queries based on a masked\nsimplicial convolutional autoencoder (SCAE) leveraging both local and remote\nteacher's knowledge. Numerical results corroborate the effectiveness of the\nproposed approach in terms of improving inference query accuracy under\ndifferent channel conditions and simplicial structures. Experiments on a\ncoauthorship dataset show that removing simplices by ranking the Laplacian\nvalues yields an 85% reduction in payload size without sacrificing accuracy.\nJoint semantic communication and inference by masked SCAE improves query\naccuracy by 25% compared to local student based query and 15% compared to\nremote teacher based query. 
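The ontology-driven data augmentation mentioned in the query-answering abstract above can be pictured as materializing extra triples from axioms before training an embedding model. A toy sketch follows; the subclass hierarchy, entity names, and relation name are invented for illustration:

```python
# Toy subclass ontology; all names here are hypothetical.
subclass = {"Professor": "Faculty", "Faculty": "Person"}

def augment(triples):
    """Materialize the extra type triples implied by the subclass axioms."""
    out = set(triples)
    changed = True
    while changed:                                   # fixpoint = transitive closure
        changed = False
        for h, r, t in list(out):
            if r == "type" and t in subclass and (h, r, subclass[t]) not in out:
                out.add((h, r, subclass[t]))
                changed = True
    return out

# adds ("alice", "type", "Faculty") and ("alice", "type", "Person")
print(augment({("alice", "type", "Professor")}))
```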
Finally, incorporating channel semantics is shown\nto effectively improve inference accuracy, notably at low SNR values.\n","authors":["Qiyang Zhao","Hang Zou","Mehdi Bennis","Merouane Debbah","Ebtesam Almazrouei","Faouzi Bader"],"pdf_url":"https://arxiv.org/pdf/2308.16789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16781v1","updated":"2023-08-31T14:59:32Z","published":"2023-08-31T14:59:32Z","title":"StratMed: Relevance Stratification for Low-resource Medication\n Recommendation","summary":" With the growing imbalance between limited medical resources and escalating\ndemands, AI-based clinical tasks have become paramount. Medication\nrecommendation, as a sub-domain, aims to amalgamate longitudinal patient\nhistory with medical knowledge, assisting physicians in prescribing safer and\nmore accurate medication combinations. Existing methods overlook the inherent\nlong-tail distribution in medical data, lacking balanced representation between\nhead and tail data, which leads to sub-optimal model performance. To address\nthis challenge, we introduce StratMed, a model that incorporates an innovative\nrelevance stratification mechanism. It harmonizes discrepancies in data\nlong-tail distribution and strikes a balance between the safety and accuracy of\nmedication combinations. Specifically, we first construct a pre-training method\nusing deep learning networks to obtain entity representation. After that, we\ndesign a pyramid-like data stratification method to obtain more generalized\nentity relationships by reinforcing the features of unpopular entities. Based\non this relationship, we designed two graph structures to express medication\nprecision and safety at the same level to obtain visit representations.\nFinally, the patient's historical clinical information is fitted to generate\nmedication combinations for the current health condition. Experiments on the\nMIMIC-III dataset demonstrate that our method has outperformed current\nstate-of-the-art methods in four evaluation metrics (including safety and\naccuracy).\n","authors":["Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.16781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16775v1","updated":"2023-08-31T14:54:06Z","published":"2023-08-31T14:54:06Z","title":"Efficacy of Neural Prediction-Based NAS for Zero-Shot NAS Paradigm","summary":" In prediction-based Neural Architecture Search (NAS), performance indicators\nderived from graph convolutional networks have shown significant success. These\nindicators, achieved by representing feed-forward structures as component\ngraphs through one-hot encoding, face a limitation: their inability to evaluate\narchitecture performance across varying search spaces. In contrast, handcrafted\nperformance indicators (zero-shot NAS), which use the same architecture with\nrandom initialization, can generalize across multiple search spaces. Addressing\nthis limitation, we propose a novel approach for zero-shot NAS using deep\nlearning. Our method employs Fourier sum of sines encoding for convolutional\nkernels, enabling the construction of a computational feed-forward graph with a\nstructure similar to the architecture under evaluation. These encodings are\nlearnable and offer a comprehensive view of the architecture's topological\ninformation. An accompanying multi-layer perceptron (MLP) then ranks these\narchitectures based on their encodings. 
Experimental results show that our\napproach surpasses previous methods using graph convolutional networks in terms\nof correlation on the NAS-Bench-201 dataset and exhibits a higher convergence\nrate. Moreover, our extracted feature representation trained on each\nNAS-Benchmark is transferable to other NAS-Benchmarks, showing promising\ngeneralizability across multiple search spaces. The code is available at:\nhttps://github.com/minh1409/DFT-NPZS-NAS\n","authors":["Minh Le","Nhan Nguyen","Ngoc Hoang Luong"],"pdf_url":"https://arxiv.org/pdf/2308.16775v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.17670v2","updated":"2023-08-31T14:53:15Z","published":"2023-06-30T14:01:53Z","title":"Learning Delays in Spiking Neural Networks using Dilated Convolutions\n with Learnable Spacings","summary":" Spiking Neural Networks (SNNs) are a promising research direction for\nbuilding power-efficient information processing systems, especially for\ntemporal tasks such as speech recognition. In SNNs, delays refer to the time\nneeded for one spike to travel from one neuron to another. These delays matter\nbecause they influence the spike arrival times, and it is well-known that\nspiking neurons respond more strongly to coincident input spikes. More\nformally, it has been shown theoretically that plastic delays greatly increase\nthe expressivity in SNNs. Yet, efficient algorithms to learn these delays have\nbeen lacking. Here, we propose a new discrete-time algorithm that addresses\nthis issue in deep feedforward SNNs using backpropagation, in an offline\nmanner. To simulate delays between consecutive layers, we use 1D convolutions\nacross time. The kernels contain only a few non-zero weights - one per synapse\n- whose positions correspond to the delays. These positions are learned\ntogether with the weights using the recently proposed Dilated Convolution with\nLearnable Spacings (DCLS). We evaluated our method on three datasets: the\nSpiking Heidelberg Dataset (SHD), the Spiking Speech Commands (SSC) and its\nnon-spiking version Google Speech Commands v0.02 (GSC) benchmarks, which\nrequire detecting temporal patterns. We used feedforward SNNs with two or three\nhidden fully connected layers, and vanilla leaky integrate-and-fire neurons. We\nshowed that fixed random delays help and that learning them helps even more.\nFurthermore, our method outperformed the state-of-the-art on the three datasets\nwithout using recurrent connections and with substantially fewer parameters.\nOur work demonstrates the potential of delay learning in developing accurate\nand precise models for temporal data processing. Our code is based on PyTorch /\nSpikingJelly and available at: https://github.com/Thvnvtos/SNN-delays\n","authors":["Ilyass Hammouamri","Ismail Khalfaoui-Hassani","Timothée Masquelier"],"pdf_url":"https://arxiv.org/pdf/2306.17670v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00049v3","updated":"2023-08-31T14:38:57Z","published":"2023-01-31T19:33:14Z","title":"Transformers Meet Directed Graphs","summary":" Transformers were originally proposed as a sequence-to-sequence model for\ntext but have become vital for a wide range of modalities, including images,\naudio, video, and undirected graphs. However, transformers for directed graphs\nare a surprisingly underexplored topic, despite their applicability to\nubiquitous domains, including source code and logic circuits. 
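The delay-as-kernel-position construction in the SNN abstract above is concrete enough to sketch: a synapse's 1D temporal kernel holds a single non-zero weight, and the position of that weight is the delay. A minimal numpy illustration follows (the paper learns the positions with DCLS; here the delay is fixed, and all names are illustrative):

```python
import numpy as np

def delayed_synapse(spikes: np.ndarray, weight: float, delay: int, max_delay: int = 8):
    """Causal 1-D convolution where the kernel's single non-zero entry encodes the delay."""
    kernel = np.zeros(max_delay + 1)
    kernel[delay] = weight                       # weight position = synaptic delay
    padded = np.concatenate([np.zeros(max_delay), spikes])
    return np.convolve(padded, kernel, mode="valid")[: len(spikes)]

spikes = np.array([1.0, 0, 0, 1.0, 0, 0, 0, 0])
print(delayed_synapse(spikes, weight=0.7, delay=2))  # inputs arrive 2 steps later
```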
In this work, we\npropose two direction- and structure-aware positional encodings for directed\ngraphs: (1) the eigenvectors of the Magnetic Laplacian - a direction-aware\ngeneralization of the combinatorial Laplacian; (2) directional random walk\nencodings. Empirically, we show that the extra directionality information is\nuseful in various downstream tasks, including correctness testing of sorting\nnetworks and source code understanding. Together with a data-flow-centric graph\nconstruction, our model outperforms the prior state of the art on the Open\nGraph Benchmark Code2 by 14.7% relative.\n","authors":["Simon Geisler","Yujia Li","Daniel Mankowitz","Ali Taylan Cemgil","Stephan Günnemann","Cosmin Paduraru"],"pdf_url":"https://arxiv.org/pdf/2302.00049v3.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.16759v1","updated":"2023-08-31T14:27:36Z","published":"2023-08-31T14:27:36Z","title":"Constructing Indoor Region-based Radio Map without Location Labels","summary":" Radio map construction requires a large amount of radio measurement data with\nlocation labels, which imposes a high deployment cost. This paper develops a\nregion-based radio map from received signal strength (RSS) measurements without\nlocation labels. The construction is based on a set of blindly collected RSS\nmeasurement data from a device that visits each region in an indoor area\nexactly once, where the footprints and timestamps are not recorded. The main\nchallenge is to cluster the RSS data and match clusters with the physical\nregions. Classical clustering algorithms fail to work as the RSS data naturally\nappears as non-clustered due to multipaths and noise. In this paper, a signal\nsubspace model with a sequential prior is constructed for the RSS data, and an\nintegrated segmentation and clustering algorithm is developed, which is shown\nto find the globally optimal solution in a special case. Furthermore, the\nclustered data is matched with the physical regions using a graph-based\napproach. Based on real measurements from an office space, the proposed scheme\nreduces the region localization error by roughly 50% compared to a weighted\ncentroid localization (WCL) baseline, and it even outperforms some supervised\nlocalization schemes, including k-nearest neighbor (KNN), support vector\nmachine (SVM), and deep neural network (DNN), which require labeled data for\ntraining.\n","authors":["Zheng Xing","Junting Chen"],"pdf_url":"https://arxiv.org/pdf/2308.16759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16754v1","updated":"2023-08-31T14:21:40Z","published":"2023-08-31T14:21:40Z","title":"Training Neural Networks Using Reproducing Kernel Space Interpolation\n and Model Reduction","summary":" We introduce and study the theory of training neural networks using\ninterpolation techniques from reproducing kernel Hilbert space theory. We\ngeneralize the method to Krein spaces, and show that widely-used neural network\narchitectures are subsets of reproducing kernel Krein spaces (RKKS). We study\nthe concept of \"associated Hilbert spaces\" of RKKS and develop techniques to\nimprove upon the expressivity of various activation functions. Next, using\nconcepts from the theory of functions of several complex variables, we prove a\ncomputationally applicable, multidimensional generalization of the celebrated\nAdamjan-Arov-Krein (AAK) theorem. The theorem yields a novel class of neural\nnetworks, called Prolongation Neural Networks (PNN). 
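Returning to the Magnetic Laplacian positional encodings described at the top of this entry: the construction admits a short sketch, symmetrizing the adjacency for magnitudes, encoding edge direction as a complex phase, and taking eigenvectors of the resulting Hermitian Laplacian. The charge parameter q below is chosen arbitrarily for illustration:

```python
import numpy as np

def magnetic_laplacian_pe(A: np.ndarray, q: float = 0.25, k: int = 2):
    """Positional encodings from the k smallest eigenvectors of the magnetic Laplacian."""
    A_s = (A + A.T) / 2                      # symmetrized adjacency (magnitudes)
    theta = 2 * np.pi * q * (A - A.T)        # direction enters as a phase
    H = A_s * np.exp(1j * theta)             # Hermitian "magnetic" adjacency
    L = np.diag(A_s.sum(1)) - H              # magnetic Laplacian (Hermitian)
    vals, vecs = np.linalg.eigh(L)           # eigh handles complex Hermitian input
    return vecs[:, :k]

A = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]], float)  # directed path 0 -> 1 -> 2
print(magnetic_laplacian_pe(A))
```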
We demonstrate that, by\napplying the multidimensional AAK theorem to obtain a PNN, one can achieve\nperformance superior to both our interpolatory methods and current\nstate-of-the-art methods in noisy environments. We provide useful illustrations\nof our methods in practice.\n","authors":["Eric Arthur Werneburg"],"pdf_url":"https://arxiv.org/pdf/2308.16754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16752v1","updated":"2023-08-31T14:16:30Z","published":"2023-08-31T14:16:30Z","title":"Moreau Envelope ADMM for Decentralized Weakly Convex Optimization","summary":" This paper proposes a proximal variant of the alternating direction method of\nmultipliers (ADMM) for distributed optimization. Although current versions of\nthe ADMM algorithm provide promising numerical results in producing solutions\nthat are close to optimal for many convex and non-convex optimization problems,\nit remains unclear if they can converge to a stationary point for weakly convex\nand locally non-smooth functions. Through our analysis using the Moreau\nenvelope function, we demonstrate that the proposed Moreau envelope ADMM (MADM)\ncan indeed converge to a stationary point under mild conditions. Our analysis\nalso includes computing the bounds on the amount of change in the dual variable\nupdate step by relating the gradient of the Moreau envelope function to the\nproximal function. Furthermore, the results of our numerical experiments\nindicate that our method is faster and more robust than widely-used\napproaches.\n","authors":["Reza Mirzaeifard","Naveen K. D. Venkategowda","Alexander Jung","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2308.16752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16738v1","updated":"2023-08-31T13:54:57Z","published":"2023-08-31T13:54:57Z","title":"US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for\n Cervical Lymph Node Lesions Diagnoses in Ultrasound Images","summary":" Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph\nnode lesions. However, the diagnoses of these images largely hinge on the\nexpertise of medical practitioners, rendering the process susceptible to\nmisdiagnoses. Although rapidly developing deep learning has substantially\nimproved the diagnoses of diverse ultrasound images, there remains a\nconspicuous research gap concerning cervical lymph nodes. The objective of our\nwork is to accurately diagnose cervical lymph node lesions by leveraging a deep\nlearning model. To this end, we first collected 3392 images containing normal\nlymph nodes, benign lymph node lesions, malignant primary lymph node lesions,\nand malignant metastatic lymph node lesions. Given that ultrasound images are\ngenerated by the reflection and scattering of sound waves across varied bodily\ntissues, we proposed the Conv-FFT Block. It integrates convolutional operations\nwith the fast Fourier transform to more astutely model the images. Building\nupon this foundation, we designed a novel architecture, named US-SFNet. This\narchitecture not only discerns variances in ultrasound images from the spatial\ndomain but also adeptly captures microstructural alterations across various\nlesions in the frequency domain. To ascertain the potential of US-SFNet, we\nbenchmarked it against 12 popular architectures through five-fold\ncross-validation. 
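Since the Moreau envelope is central to the MADM analysis above, a small worked example may help: for f = |·|, the proximal map is soft-thresholding and the envelope is the Huber function. This illustrates the tools used in the analysis, not the MADM algorithm itself:

```python
import numpy as np

def prox_abs(x, lam):
    """prox_{lam |.|}(x): soft-thresholding."""
    return np.sign(x) * np.maximum(np.abs(x) - lam, 0.0)

def moreau_env_abs(x, lam):
    """M_f(x) = min_y |y| + (x - y)^2 / (2 lam); equals the Huber function here."""
    p = prox_abs(x, lam)
    return np.abs(p) + (x - p) ** 2 / (2 * lam)

x = np.linspace(-2, 2, 5)
print(prox_abs(x, 0.5))        # shrunken points
print(moreau_env_abs(x, 0.5))  # smooth lower model of |x|
```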
The results show that US-SFNet is SOTA and achieves 92.89%\naccuracy, 90.46% precision, 89.95% sensitivity and 97.49% specificity.\n","authors":["Yubiao Yue","Jun Xue","Haihua Liang","Bingchun Luo","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.16738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16737v1","updated":"2023-08-31T13:54:37Z","published":"2023-08-31T13:54:37Z","title":"Robust Networked Federated Learning for Localization","summary":" This paper addresses the problem of localization, which is inherently\nnon-convex and non-smooth in a federated setting where the data is distributed\nacross a multitude of devices. Due to the decentralized nature of federated\nenvironments, distributed learning becomes essential for scalability and\nadaptability. Moreover, these environments are often plagued by outlier data,\nwhich presents substantial challenges to conventional methods, particularly in\nmaintaining estimation accuracy and ensuring algorithm convergence. To mitigate\nthese challenges, we propose a method that adopts an $L_1$-norm robust\nformulation within a distributed sub-gradient framework, explicitly designed to\nhandle these obstacles. Our approach addresses the problem in its original\nform, without resorting to iterative simplifications or approximations,\nresulting in enhanced computational efficiency and improved estimation\naccuracy. We demonstrate that our method converges to a stationary point,\nhighlighting its effectiveness and reliability. Through numerical simulations,\nwe confirm the superior performance of our approach, notably in outlier-rich\nenvironments, which surpasses existing state-of-the-art localization methods.\n","authors":["Reza Mirzaeifard","Naveen K. D. Venkategowda","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2308.16737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10283v2","updated":"2023-08-31T13:47:57Z","published":"2023-08-20T14:36:45Z","title":"Adaptive Uncertainty-Guided Model Selection for Data-Driven PDE\n Discovery","summary":" We propose a new parameter-adaptive uncertainty-penalized Bayesian\ninformation criterion (UBIC) to prioritize the parsimonious partial\ndifferential equation (PDE) that sufficiently governs noisy spatial-temporal\nobserved data with few reliable terms. Since the naive use of the BIC for model\nselection has been known to yield an undesirable overfitted PDE, the UBIC\npenalizes the found PDE not only by its complexity but also by the quantified\nuncertainty, derived from the model supports' coefficient of variation in a\nprobabilistic view. We also introduce physics-informed neural network learning\nas a simulation-based approach to further validate the selected PDE flexibly\nagainst the other discovered PDE. Numerical results affirm the successful\napplication of the UBIC in identifying the true governing PDE. Additionally, we\nreveal an interesting effect of denoising the observed data on improving the\ntrade-off between the BIC score and model complexity. 
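As a rough illustration of an uncertainty-penalized BIC in the spirit of the UBIC abstract above: the sketch below combines a standard Gaussian-residual BIC with a penalty built from the coefficient of variation of the fitted coefficients. The exact penalty, the bootstrap-style coefficient samples, and the weight `lam` are all assumptions for illustration, not the paper's formula:

```python
import numpy as np

def ubic_like(residuals: np.ndarray, coef_samples: np.ndarray, lam: float = 1.0):
    """BIC plus an uncertainty penalty from the coefficients' coefficient of variation."""
    n = residuals.size
    k = coef_samples.shape[1]                         # number of active PDE terms
    bic = n * np.log(np.mean(residuals ** 2)) + k * np.log(n)
    cv = np.abs(coef_samples.std(0) / coef_samples.mean(0))
    return bic + lam * n * np.log1p(cv.mean())        # uncertain supports cost extra

rng = np.random.default_rng(1)
print(ubic_like(rng.normal(0, 0.1, 200), rng.normal(1.0, 0.05, (20, 3))))
```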
Code is available at\nhttps://github.com/Pongpisit-Thanasutives/UBIC.\n","authors":["Pongpisit Thanasutives","Takashi Morita","Masayuki Numao","Ken-ichi Fukui"],"pdf_url":"https://arxiv.org/pdf/2308.10283v2.pdf","comment":"17 pages, 15 figures"},{"id":"http://arxiv.org/abs/2308.16718v1","updated":"2023-08-31T13:37:28Z","published":"2023-08-31T13:37:28Z","title":"Robust Representation Learning for Unreliable Partial Label Learning","summary":" Partial Label Learning (PLL) is a type of weakly supervised learning where\neach training instance is assigned a set of candidate labels, but only one\nlabel is the ground-truth. However, this idealistic assumption may not always\nhold due to potential annotation inaccuracies, meaning the ground-truth may not\nbe present in the candidate label set. This is known as Unreliable Partial\nLabel Learning (UPLL) that introduces an additional complexity due to the\ninherent unreliability and ambiguity of partial labels, often resulting in a\nsub-optimal performance with existing methods. To address this challenge, we\npropose the Unreliability-Robust Representation Learning framework (URRL) that\nleverages unreliability-robust contrastive learning to help the model fortify\nagainst unreliable partial labels effectively. Concurrently, we propose a dual\nstrategy that combines KNN-based candidate label set correction and\nconsistency-regularization-based label disambiguation to refine label quality\nand enhance the ability of representation learning within the URRL framework.\nExtensive experiments demonstrate that the proposed method outperforms\nstate-of-the-art PLL methods on various datasets with diverse degrees of\nunreliability and ambiguity. Furthermore, we provide a theoretical analysis of\nour approach from the perspective of the expectation maximization (EM)\nalgorithm. Upon acceptance, we pledge to make the code publicly accessible.\n","authors":["Yu Shi","Dong-Dong Wu","Xin Geng","Min-Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00241v2","updated":"2023-08-31T13:36:21Z","published":"2023-04-29T11:46:53Z","title":"When Deep Learning Meets Polyhedral Theory: A Survey","summary":" In the past decade, deep learning became the prevalent methodology for\npredictive modeling thanks to the remarkable accuracy of deep neural networks\nin tasks such as computer vision and natural language processing. Meanwhile,\nthe structure of neural networks converged back to simpler representations\nbased on piecewise constant and piecewise linear functions such as the\nRectified Linear Unit (ReLU), which became the most commonly used type of\nactivation function in neural networks. That made certain types of network\nstructure $\\unicode{x2014}$such as the typical fully-connected feedforward\nneural network$\\unicode{x2014}$ amenable to analysis through polyhedral theory\nand to the application of methodologies such as Linear Programming (LP) and\nMixed-Integer Linear Programming (MILP) for a variety of purposes. 
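The piecewise-linear view in the survey above can be made tangible by counting the distinct ReLU activation patterns on a grid: each pattern fixes one affine piece of the network, which is exactly what LP/MILP-based analyses exploit. A tiny illustration (sizes and grid chosen arbitrarily):

```python
import numpy as np

rng = np.random.default_rng(3)
W, b = rng.normal(size=(4, 2)), rng.normal(size=4)   # one ReLU layer: 2 -> 4

xs = np.linspace(-2.0, 2.0, 200)
pts = np.stack(np.meshgrid(xs, xs), axis=-1).reshape(-1, 2)
patterns = (pts @ W.T + b > 0)                       # boolean pattern per grid point
# each unique pattern is one linear region; at most 11 regions for 4 lines in 2-D
print("regions found:", len(np.unique(patterns, axis=0)))
```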
In this\npaper, we survey the main topics emerging from this fast-paced area of work,\nwhich bring a fresh perspective to understanding neural networks in more detail\nas well as to applying linear optimization techniques to train, verify, and\nreduce the size of such networks.\n","authors":["Joey Huchette","Gonzalo Muñoz","Thiago Serra","Calvin Tsay"],"pdf_url":"https://arxiv.org/pdf/2305.00241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14505v2","updated":"2023-08-31T13:10:04Z","published":"2023-04-03T11:45:27Z","title":"Transformer-based interpretable multi-modal data fusion for skin lesion\n classification","summary":" A lot of deep learning (DL) research these days is mainly focused on\nimproving quantitative metrics regardless of other factors. In human-centered\napplications, like skin lesion classification in dermatology, DL-driven\nclinical decision support systems are still in their infancy due to the limited\ntransparency of their decision-making process. Moreover, the lack of procedures\nthat can explain the behavior of trained DL algorithms leads to almost no trust\nfrom clinical physicians. To diagnose skin lesions, dermatologists rely on\nvisual assessment of the disease and the data gathered from the patient's\nanamnesis. Data-driven algorithms dealing with multi-modal data are limited by\nthe separation of feature-level and decision-level fusion procedures required\nby convolutional architectures. To address this issue, we enable single-stage\nmulti-modal data fusion via the attention mechanism of transformer-based\narchitectures to aid in diagnosing skin diseases. Our method beats other\nstate-of-the-art single- and multi-modal DL architectures in image-rich and\npatient-data-rich environments. Additionally, the choice of the architecture\nenables native interpretability support for the classification task both in the\nimage and metadata domain with no additional modifications necessary.\n","authors":["Theodor Cheslerean-Boghiu","Melia-Evelina Fleischmann","Theresa Willem","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2304.14505v2.pdf","comment":"Submitted to IEEE JBHI in July 2023"},{"id":"http://arxiv.org/abs/2212.14424v2","updated":"2023-08-31T13:06:03Z","published":"2022-12-29T18:55:00Z","title":"Invertible normalizing flow neural networks by JKO scheme","summary":" Normalizing flow is a class of deep generative models for efficient sampling\nand density estimation. In practice, the flow often appears as a chain of\ninvertible neural network blocks; to facilitate training, existing works have\nregularized flow trajectories and designed special network architectures. The\ncurrent paper develops a neural ODE flow network inspired by the\nJordan-Kinderleherer-Otto (JKO) scheme, which allows efficient block-wise\ntraining of the residual blocks without sampling SDE trajectories or inner\nloops of score matching or variational learning. As the JKO scheme unfolds the\ndynamic of gradient flow, the proposed model naturally stacks residual network\nblocks one by one, reducing the memory load and difficulty in performing\nend-to-end deep flow network training. We also develop adaptive time\nreparameterization of the flow network with a progressive refinement of the\ntrajectory in probability space, which improves the model training efficiency\nand accuracy in practice. 
Using numerical experiments with synthetic and real\ndata, we show that the proposed JKO-iFlow model achieves similar or better\nperformance in generating new samples compared with the existing flow and\ndiffusion models at a significantly reduced computational and memory cost.\n","authors":["Chen Xu","Xiuyuan Cheng","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2212.14424v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02329v3","updated":"2023-08-31T12:44:58Z","published":"2023-07-05T14:39:47Z","title":"Data-driven Predictive Latency for 5G: A Theoretical and Experimental\n Analysis Using Network Measurements","summary":" The advent of novel 5G services and applications with binding latency\nrequirements and guaranteed Quality of Service (QoS) hastened the need to\nincorporate autonomous and proactive decision-making in network management\nprocedures. The objective of our study is to provide a thorough analysis of\npredictive latency within 5G networks by utilizing real-world network data that\nis accessible to mobile network operators (MNOs). In particular, (i) we present\nan analytical formulation of the user-plane latency as a Hypoexponential\ndistribution, which is validated by means of a comparative analysis with\nempirical measurements, and (ii) we present experimental results on\nprobabilistic regression, anomaly detection, and predictive forecasting,\nleveraging emerging domains in Machine Learning (ML), such as Bayesian\nLearning (BL) and Machine Learning on Graphs (GML). We test our predictive\nframework using data gathered from scenarios of vehicular mobility, dense-urban\ntraffic, and social gathering events. Our results provide valuable insights\ninto the efficacy of predictive algorithms in practical applications.\n","authors":["Marco Skocaj","Francesca Conserva","Nicol Sarcone Grande","Andrea Orsi","Davide Micheli","Giorgio Ghinamo","Simone Bizzarri","Roberto Verdone"],"pdf_url":"https://arxiv.org/pdf/2307.02329v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11463v2","updated":"2023-08-31T12:42:43Z","published":"2023-05-19T06:33:57Z","title":"Generative Sliced MMD Flows with Riesz Kernels","summary":" Maximum mean discrepancy (MMD) flows suffer from high computational costs in\nlarge-scale computations. In this paper, we show that MMD flows with Riesz\nkernels $K(x,y) = - \\Vert x-y\\Vert^r$, $r \\in (0,2)$ have exceptional\nproperties which allow their efficient computation. We prove that the MMD of\nRiesz kernels coincides with the MMD of their sliced version. As a consequence,\nthe computation of gradients of MMDs can be performed in the one-dimensional\nsetting. Here, for $r=1$, a simple sorting algorithm can be applied to reduce\nthe complexity from $O(MN+N^2)$ to $O((M+N)\\log(M+N))$ for two measures with\n$M$ and $N$ support points. As another interesting follow-up result, the MMD of\ncompactly supported measures can be estimated from above and below by the\nWasserstein-1 distance. For the implementations we approximate the gradient of\nthe sliced MMD by using only a finite number $P$ of slices. We show that the\nresulting error has complexity $O(\\sqrt{d/P})$, where $d$ is the data\ndimension. These results enable us to train generative models by approximating\nMMD gradient flows by neural networks even for image applications. 
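The r = 1 case of the Riesz-kernel MMD above is particularly concrete: on each 1-D projection, the pairwise kernel sums reduce to sorted-order computations with prefix sums, giving the O((M+N) log(M+N)) cost the abstract mentions. Here is a self-contained numpy sketch of the resulting sliced estimator in energy-distance form; the projection count and variable names are illustrative:

```python
import numpy as np

def mean_abs_diff_1d(x, y):
    """E|X - Y| for 1-D samples via sorting and prefix sums, O((m+n) log(m+n))."""
    x = np.sort(x)
    csum = np.concatenate(([0.0], np.cumsum(x)))
    c = np.searchsorted(x, y, side="right")          # how many x_i <= each y_j
    s, total = csum[c], csum[-1]
    return float(np.sum(c * y - s + (total - s) - (x.size - c) * y)) / (x.size * y.size)

def sliced_riesz_mmd(X, Y, n_proj=64, seed=0):
    """Sliced estimator for the Riesz kernel K(x, y) = -||x - y|| (energy form)."""
    rng = np.random.default_rng(seed)
    acc = 0.0
    for _ in range(n_proj):
        u = rng.normal(size=X.shape[1]); u /= np.linalg.norm(u)
        xp, yp = X @ u, Y @ u
        acc += 2 * mean_abs_diff_1d(xp, yp) - mean_abs_diff_1d(xp, xp) - mean_abs_diff_1d(yp, yp)
    return acc / n_proj

rng = np.random.default_rng(1)
X, Y = rng.normal(0, 1, (500, 3)), rng.normal(0.5, 1, (400, 3))
print(sliced_riesz_mmd(X, Y))   # > 0: the two distributions differ
```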
We\ndemonstrate the efficiency of our model by image generation on MNIST,\nFashionMNIST and CIFAR10.\n","authors":["Johannes Hertrich","Christian Wald","Fabian Altekrüger","Paul Hagemann"],"pdf_url":"https://arxiv.org/pdf/2305.11463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.08060v2","updated":"2023-08-31T12:41:13Z","published":"2021-12-15T11:55:11Z","title":"Leveraging Image-based Generative Adversarial Networks for Time Series\n Generation","summary":" Generative models for images have gained significant attention in computer\nvision and natural language processing due to their ability to generate\nrealistic samples from complex data distributions. To leverage the advances of\nimage-based generative models for the time series domain, we propose a\ntwo-dimensional image representation for time series, the Extended\nIntertemporal Return Plot (XIRP). Our approach captures the intertemporal time\nseries dynamics in a scale-invariant and invertible way, reducing training time\nand improving sample quality. We benchmark synthetic XIRPs obtained by an\noff-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) to other image\nrepresentations and models regarding similarity and predictive ability metrics.\nOur novel, validated image representation for time series consistently and\nsignificantly outperforms a state-of-the-art RNN-based generative model\nregarding predictive ability. Further, we introduce an improved stochastic\ninversion to substantially improve simulation quality regardless of the\nrepresentation and provide the prospect of transfer potentials in other\ndomains.\n","authors":["Justin Hellermann","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2112.08060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16684v1","updated":"2023-08-31T12:38:29Z","published":"2023-08-31T12:38:29Z","title":"Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor\n Attack","summary":" The vulnerabilities to backdoor attacks have recently threatened the\ntrustworthiness of machine learning models in practical applications.\nConventional wisdom suggests that not everyone can be an attacker since the\nprocess of designing the trigger generation algorithm often involves\nsignificant effort and extensive experimentation to ensure the attack's\nstealthiness and effectiveness. Alternatively, this paper shows that there\nexists a more severe backdoor threat: anyone can exploit an easily-accessible\nalgorithm for silent backdoor attacks. Specifically, this attacker can employ\nthe widely-used lossy image compression from a plethora of compression tools to\neffortlessly inject a trigger pattern into an image without leaving any\nnoticeable trace; i.e., the generated triggers are natural artifacts. One does\nnot require extensive knowledge to click on the \"convert\" or \"save as\" button\nwhile using tools for lossy image compression. Via this attack, the adversary\ndoes not need to design a trigger generator as seen in prior works and only\nrequires poisoning the data. Empirically, the proposed attack consistently\nachieves 100% attack success rate in several benchmark datasets such as MNIST,\nCIFAR-10, GTSRB and CelebA. More significantly, the proposed attack can still\nachieve almost 100% attack success rate with very small (approximately 10%)\npoisoning rates in the clean label setting. 
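The backdoor attack described above needs no trigger generator at all; poisoning is literally a lossy re-encode of the image. A minimal sketch with Pillow's JPEG encoder follows (the quality setting is chosen arbitrarily, and the random image is a stand-in for real training data):

```python
from io import BytesIO

import numpy as np
from PIL import Image

def poison_with_jpeg(img: Image.Image, quality: int = 10) -> Image.Image:
    """Re-encode with aggressive lossy compression; the artifacts act as the trigger."""
    buf = BytesIO()
    img.save(buf, format="JPEG", quality=quality)   # the "save as" button, in code
    buf.seek(0)
    return Image.open(buf).convert("RGB")

clean = Image.fromarray(
    np.random.default_rng(0).integers(0, 256, (32, 32, 3), dtype=np.uint8)
)
triggered = poison_with_jpeg(clean)   # an attacker would pair this with a target label
```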
The generated trigger of the\nproposed attack using one lossy compression algorithm is also transferable\nacross other related compression algorithms, exacerbating the severity of this\nbackdoor threat. This work takes another crucial step toward understanding the\nextensive risks of backdoor attacks in practice, urging practitioners to\ninvestigate similar attacks and relevant backdoor mitigation methods.\n","authors":["Sze Jue Yang","Quang Nguyen","Chee Seng Chan","Khoa Doan"],"pdf_url":"https://arxiv.org/pdf/2308.16684v1.pdf","comment":"14 pages. This paper shows everyone can mount a powerful and stealthy\n backdoor attack with the widely-used lossy image compression"},{"id":"http://arxiv.org/abs/2308.16681v1","updated":"2023-08-31T12:32:43Z","published":"2023-08-31T12:32:43Z","title":"Everything, Everywhere All in One Evaluation: Using Multiverse Analysis\n to Evaluate the Influence of Model Design Decisions on Algorithmic Fairness","summary":" A vast number of systems across the world use algorithmic decision making\n(ADM) to (partially) automate decisions that have previously been made by\nhumans. When designed well, these systems promise more objective decisions\nwhile saving large amounts of resources and freeing up human time. However,\nwhen ADM systems are not designed well, they can lead to unfair decisions which\ndiscriminate against societal groups. The downstream effects of ADMs critically\ndepend on the decisions made during the systems' design and implementation, as\nbiases in data can be mitigated or reinforced along the modeling pipeline. Many\nof these design decisions are made implicitly, without knowing exactly how they\nwill influence the final system. It is therefore important to make explicit the\ndecisions made during the design of ADM systems and understand how these\ndecisions affect the fairness of the resulting system.\n To study this issue, we draw on insights from the field of psychology and\nintroduce the method of multiverse analysis for algorithmic fairness. In our\nproposed method, we turn implicit design decisions into explicit ones and\ndemonstrate their fairness implications. By combining decisions, we create a\ngrid of all possible \"universes\" of decision combinations. For each of these\nuniverses, we compute metrics of fairness and performance. Using the resulting\ndataset, one can see how and which decisions impact fairness. We demonstrate\nhow multiverse analyses can be used to better understand variability and\nrobustness of algorithmic fairness using an exemplary case study of predicting\npublic health coverage of vulnerable populations for potential interventions.\nOur results illustrate how decisions during the design of a machine learning\nsystem can have surprising effects on its fairness and how to detect these\neffects using multiverse analysis.\n","authors":["Jan Simson","Florian Pfisterer","Christoph Kern"],"pdf_url":"https://arxiv.org/pdf/2308.16681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16680v1","updated":"2023-08-31T12:32:34Z","published":"2023-08-31T12:32:34Z","title":"Branches of a Tree: Taking Derivatives of Programs with Discrete and\n Branching Randomness in High Energy Physics","summary":" We propose to apply several gradient estimation techniques to enable the\ndifferentiation of programs with discrete randomness in High Energy Physics.\nSuch programs are common in High Energy Physics due to the presence of\nbranching processes and clustering-based analysis. 
Thus differentiating such\nprograms can open the way for gradient based optimization in the context of\ndetector design optimization, simulator tuning, or data analysis and\nreconstruction optimization. We discuss several possible gradient estimation\nstrategies, including the recent Stochastic AD method, and compare them in\nsimplified detector design experiments. In doing so we develop, to the best of\nour knowledge, the first fully differentiable branching program.\n","authors":["Michael Kagan","Lukas Heinrich"],"pdf_url":"https://arxiv.org/pdf/2308.16680v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.16678v1","updated":"2023-08-31T12:29:24Z","published":"2023-08-31T12:29:24Z","title":"Dynamic nsNet2: Efficient Deep Noise Suppression with Early Exiting","summary":" Although deep learning has made strides in the field of deep noise\nsuppression, leveraging deep architectures on resource-constrained devices\nstill proved challenging. Therefore, we present an early-exiting model based on\nnsNet2 that provides several levels of accuracy and resource savings by halting\ncomputations at different stages. Moreover, we adapt the original architecture\nby splitting the information flow to take into account the injected dynamism.\nWe show the trade-offs between performance and computational complexity based\non established metrics.\n","authors":["Riccardo Miccini","Alaa Zniber","Clément Laroche","Tobias Piechowiak","Martin Schoeberl","Luca Pezzarossa","Ouassim Karrakchou","Jens Sparsø","Mounir Ghogho"],"pdf_url":"https://arxiv.org/pdf/2308.16678v1.pdf","comment":"Accepted at the MLSP 2023"},{"id":"http://arxiv.org/abs/2308.16671v1","updated":"2023-08-31T12:22:40Z","published":"2023-08-31T12:22:40Z","title":"Communication-Efficient Decentralized Federated Learning via One-Bit\n Compressive Sensing","summary":" Decentralized federated learning (DFL) has gained popularity due to its\npracticality across various applications. Compared to the centralized version,\ntraining a shared model among a large number of nodes in DFL is more\nchallenging, as there is no central server to coordinate the training process.\nEspecially when distributed nodes suffer from limitations in communication or\ncomputational resources, DFL will experience extremely inefficient and unstable\ntraining. Motivated by these challenges, in this paper, we develop a novel\nalgorithm based on the framework of the inexact alternating direction method\n(iADM). On one hand, our goal is to train a shared model with a sparsity\nconstraint. This constraint enables us to leverage one-bit compressive sensing\n(1BCS), allowing transmission of one-bit information among neighbour nodes. On\nthe other hand, communication between neighbour nodes occurs only at certain\nsteps, reducing the number of communication rounds. Therefore, the algorithm\nexhibits notable communication efficiency. Additionally, as each node selects\nonly a subset of neighbours to participate in the training, the algorithm is\nrobust against stragglers. Additionally, complex items are computed only once\nfor several consecutive steps and subproblems are solved inexactly using\nclosed-form solutions, resulting in high computational efficiency. 
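The one-bit communication idea in the decentralized FL abstract above can be sketched as sign-quantizing a sparsified update and rescaling on the receiver side. The top-k sparsification and the mean-magnitude rescaling below are illustrative assumptions, not the paper's iADM scheme:

```python
import numpy as np

def one_bit_encode(update, sparsity=0.1):
    """Transmit only indices, signs, and one scale for the k largest entries."""
    k = max(1, int(sparsity * update.size))
    idx = np.argsort(np.abs(update))[-k:]
    return idx, np.sign(update[idx]), np.abs(update[idx]).mean()

def one_bit_decode(idx, signs, scale, size):
    """Reconstruct a sparse update from the one-bit payload."""
    out = np.zeros(size)
    out[idx] = signs * scale
    return out

rng = np.random.default_rng(2)
u = rng.normal(size=100)
print(one_bit_decode(*one_bit_encode(u), size=100)[:10])
```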
Finally,\nnumerical experiments showcase the algorithm's effectiveness in both\ncommunication and computation.\n","authors":["Shenglong Zhou","Kaidi Xu","Geoffrey Ye Li"],"pdf_url":"https://arxiv.org/pdf/2308.16671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.09379v2","updated":"2023-08-31T12:22:15Z","published":"2022-06-19T11:12:30Z","title":"0/1 Deep Neural Networks via Block Coordinate Descent","summary":" The step function is one of the simplest and most natural activation\nfunctions for deep neural networks (DNNs). As it counts 1 for positive\nvariables and 0 for others, its intrinsic characteristics (e.g., discontinuity\nand no viable information of subgradients) have impeded its development for\nseveral decades. Even though there is an impressive body of work on designing\nDNNs with continuous activation functions that can be deemed as surrogates of\nthe step function, the step function itself still possesses some advantageous\nproperties, such as complete robustness to outliers and the capability of\nattaining the best learning-theoretic guarantee of predictive accuracy. Hence,\nin this paper, we aim to train DNNs with the step function used as an\nactivation function (dubbed as 0/1 DNNs). We first reformulate 0/1 DNNs as an\nunconstrained optimization problem and then solve it by a block coordinate\ndescent (BCD) method. Moreover, we acquire closed-form solutions for\nsub-problems of BCD as well as its convergence properties. Furthermore, we also\nintegrate $\\ell_{2,0}$-regularization into 0/1 DNN to accelerate the training\nprocess and compress the network scale. As a result, the proposed algorithm has\na desirable performance on classifying the MNIST, FashionMNIST, Cifar10, and\nCifar100 datasets.\n","authors":["Hui Zhang","Shenglong Zhou","Geoffrey Ye Li","Naihua Xiu"],"pdf_url":"https://arxiv.org/pdf/2206.09379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16664v1","updated":"2023-08-31T12:12:56Z","published":"2023-08-31T12:12:56Z","title":"What can we learn from quantum convolutional neural networks?","summary":" We can learn from analyzing quantum convolutional neural networks (QCNNs)\nthat: 1) working with quantum data can be perceived as embedding physical\nsystem parameters through a hidden feature map; 2) their high performance for\nquantum phase recognition can be attributed to generation of a very suitable\nbasis set during the ground state embedding, where quantum criticality of spin\nmodels leads to basis functions with rapidly changing features; 3) pooling\nlayers of QCNNs are responsible for picking those basis functions that can\ncontribute to forming a high-performing decision boundary, and the learning\nprocess corresponds to adapting the measurement such that few-qubit operators\nare mapped to full-register observables; 4) generalization of QCNN models\nstrongly depends on the embedding type, and that rotation-based feature maps\nwith the Fourier basis require careful feature engineering; 5) accuracy and\ngeneralization of QCNNs with readout based on a limited number of shots favor\nthe ground state embeddings and associated physics-informed models. 
We\ndemonstrate these points in simulation, where our results shed light on\nclassification for physical processes, relevant for applications in sensing.\nFinally, we show that QCNNs with properly chosen ground state embeddings can be\nused for fluid dynamics problems, expressing shock wave solutions with good\ngeneralization and proven trainability.\n","authors":["Chukwudubem Umeano","Annie E. Paine","Vincent E. Elfving","Oleksandr Kyriienko"],"pdf_url":"https://arxiv.org/pdf/2308.16664v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.16659v1","updated":"2023-08-31T11:58:13Z","published":"2023-08-31T11:58:13Z","title":"Autoencoder-based Online Data Quality Monitoring for the CMS\n Electromagnetic Calorimeter","summary":" The online Data Quality Monitoring system (DQM) of the CMS electromagnetic\ncalorimeter (ECAL) is a crucial operational tool that allows ECAL experts to\nquickly identify, localize, and diagnose a broad range of detector issues that\nwould otherwise hinder physics-quality data taking. Although the existing ECAL\nDQM system has been continuously updated to respond to new problems, it remains\none step behind newer and unforeseen issues. Using unsupervised deep learning,\na real-time autoencoder-based anomaly detection system is developed that is\nable to detect ECAL anomalies unseen in past data. After accounting for spatial\nvariations in the response of the ECAL and the temporal evolution of anomalies,\nthe new system is able to efficiently detect anomalies while maintaining an\nestimated false discovery rate between $10^{-2}$ to $10^{-4}$, beating existing\nbenchmarks by about two orders of magnitude. The real-world performance of the\nsystem is validated using anomalies found in 2018 and 2022 LHC collision data.\nAdditionally, first results from deploying the autoencoder-based system in the\nCMS online DQM workflow for the ECAL barrel during Run 3 of the LHC are\npresented, showing its promising performance in detecting obscure issues that\ncould have been missed in the existing DQM system.\n","authors":["Abhirami Harilal","Kyungmin Park","Michael Andrews","Manfred Paulini"],"pdf_url":"https://arxiv.org/pdf/2308.16659v1.pdf","comment":"Submitted to the Proceedings of 21st International Workshop on\n Advanced Computing and Analysis Techniques in Physics Research ACAT 2022\n conference"},{"id":"http://arxiv.org/abs/2210.09134v2","updated":"2023-08-31T11:49:17Z","published":"2022-10-17T14:34:42Z","title":"Principled Pruning of Bayesian Neural Networks through Variational Free\n Energy Minimization","summary":" Bayesian model reduction provides an efficient approach for comparing the\nperformance of all nested sub-models of a model, without re-evaluating any of\nthese sub-models. Until now, Bayesian model reduction has been applied mainly\nin the computational neuroscience community on simple models. In this paper, we\nformulate and apply Bayesian model reduction to perform principled pruning of\nBayesian neural networks, based on variational free energy minimization. Direct\napplication of Bayesian model reduction, however, gives rise to approximation\nerrors. Therefore, a novel iterative pruning algorithm is presented to\nalleviate the problems arising with naive Bayesian model reduction, as\nsupported experimentally on the publicly available UCI datasets for different\ninference algorithms. This novel parameter pruning scheme solves the\nshortcomings of current state-of-the-art pruning methods that are used by the\nsignal processing community. 
The proposed approach has a clear stopping\ncriterion and minimizes the same objective that is used during training. Next\nto these benefits, our experiments indicate better model performance in\ncomparison to state-of-the-art pruning schemes.\n","authors":["Jim Beckers","Bart van Erp","Ziyue Zhao","Kirill Kondrashov","Bert de Vries"],"pdf_url":"https://arxiv.org/pdf/2210.09134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16648v1","updated":"2023-08-31T11:44:40Z","published":"2023-08-31T11:44:40Z","title":"Generate Your Own Scotland: Satellite Image Generation Conditioned on\n Maps","summary":" Despite recent advancements in image generation, diffusion models still\nremain largely underexplored in Earth Observation. In this paper we show that\nstate-of-the-art pretrained diffusion models can be conditioned on cartographic\ndata to generate realistic satellite images. We provide two large datasets of\npaired OpenStreetMap images and satellite views over the region of Mainland\nScotland and the Central Belt. We train a ControlNet model and qualitatively\nevaluate the results, demonstrating that both image quality and map fidelity\nare possible. Finally, we provide some insights on the opportunities and\nchallenges of applying these models for remote sensing. Our model weights and\ncode for creating the dataset are publicly available at\nhttps://github.com/miquel-espinosa/map-sat.\n","authors":["Miguel Espinosa","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2308.16648v1.pdf","comment":"13 pages, 6 figures. preprint"},{"id":"http://arxiv.org/abs/2306.05727v2","updated":"2023-08-31T10:54:50Z","published":"2023-06-09T07:48:36Z","title":"The Role of Diverse Replay for Generalisation in Reinforcement Learning","summary":" In reinforcement learning (RL), key components of many algorithms are the\nexploration strategy and replay buffer. These strategies regulate what\nenvironment data is collected and trained on and have been extensively studied\nin the RL literature. In this paper, we investigate the impact of these\ncomponents in the context of generalisation in multi-task RL. We investigate\nthe hypothesis that collecting and training on more diverse data from the\ntraining environments will improve zero-shot generalisation to new tasks. We\nmotivate mathematically and show empirically that generalisation to tasks that\nare \"reachable'' during training is improved by increasing the diversity of\ntransitions in the replay buffer. Furthermore, we show empirically that this\nsame strategy also shows improvement for generalisation to similar but\n\"unreachable'' tasks which could be due to improved generalisation of the\nlearned latent representations.\n","authors":["Max Weltevrede","Matthijs T. J. Spaan","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2306.05727v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2306.05109v2","updated":"2023-08-31T10:13:12Z","published":"2023-06-08T11:16:20Z","title":"Yet Another ICU Benchmark: A Flexible Multi-Center Framework for\n Clinical ML","summary":" Medical applications of machine learning (ML) have experienced a surge in\npopularity in recent years. The intensive care unit (ICU) is a natural habitat\nfor ML given the abundance of available data from electronic health records.\nModels have been proposed to address numerous ICU prediction tasks like the\nearly detection of complications. While authors frequently report\nstate-of-the-art performance, it is challenging to verify claims of\nsuperiority. 
Datasets and code are not always published, and cohort\ndefinitions, preprocessing pipelines, and training setups are difficult to\nreproduce. This work introduces Yet Another ICU Benchmark (YAIB), a modular\nframework that allows researchers to define reproducible and comparable\nclinical ML experiments; we offer an end-to-end solution from cohort definition\nto model evaluation. The framework natively supports most open-access ICU\ndatasets (MIMIC III/IV, eICU, HiRID, AUMCdb) and is easily adaptable to future\nICU datasets. Combined with a transparent preprocessing pipeline and extensible\ntraining code for multiple ML and deep learning models, YAIB enables unified\nmodel development. Our benchmark comes with five predefined established\nprediction tasks (mortality, acute kidney injury, sepsis, kidney function, and\nlength of stay) developed in collaboration with clinicians. Adding further\ntasks is straightforward by design. Using YAIB, we demonstrate that the choice\nof dataset, cohort definition, and preprocessing have a major impact on the\nprediction performance - often more so than model class - indicating an urgent\nneed for YAIB as a holistic benchmarking tool. We provide our work to the\nclinical ML community to accelerate method development and enable real-world\nclinical implementations. Software Repository:\nhttps://github.com/rvandewater/YAIB.\n","authors":["Robin van de Water","Hendrik Schmidt","Paul Elbers","Patrick Thoral","Bert Arnrich","Patrick Rockenschaub"],"pdf_url":"https://arxiv.org/pdf/2306.05109v2.pdf","comment":"Main benchmark: https://github.com/rvandewater/YAIB, Cohort\n generation: https://github.com/rvandewater/YAIB-cohorts, Models:\n https://github.com/rvandewater/YAIB-models"},{"id":"http://arxiv.org/abs/2308.16609v1","updated":"2023-08-31T10:12:32Z","published":"2023-08-31T10:12:32Z","title":"Towards Long-Tailed Recognition for Graph Classification via\n Collaborative Experts","summary":" Graph classification, which aims to learn graph-level representations for\neffective class assignment, has achieved remarkable progress, yet it heavily\nrelies on high-quality datasets with balanced class distributions. In fact,\nmost real-world graph data naturally presents a long-tailed form, where the\nhead classes occupy far more samples than the tail classes; it is thus\nessential to study graph-level classification over long-tailed data, yet this\nproblem remains largely unexplored. Moreover, most existing long-tailed\nlearning methods in vision fail to jointly optimize representation learning\nand classifier training, and also neglect the mining of hard-to-classify\nclasses. Directly applying existing methods to graphs may lead to sub-optimal\nperformance, since the model trained on graphs would be more sensitive to the\nlong-tailed distribution due to the complex topological characteristics. Hence,\nin this paper, we propose a novel long-tailed graph-level classification\nframework via Collaborative Multi-expert Learning (CoMe) to tackle the problem.\nTo balance the contributions of head and tail classes, we first develop\nbalanced contrastive learning from the view of representation learning, and\nthen design individual-expert classifier training based on hard class mining.\nIn addition, we execute gated fusion and disentangled knowledge distillation\namong the multiple experts to promote collaboration in the multi-expert\nframework. 
Comprehensive experiments are\nperformed on seven widely-used benchmark datasets to demonstrate the\nsuperiority of our method CoMe over state-of-the-art baselines.\n","authors":["Siyu Yi","Zhengyang Mao","Wei Ju","Yongdao Zhou","Luchen Liu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16609v1.pdf","comment":"Accepted by IEEE Transactions on Big Data (TBD 2024)"},{"id":"http://arxiv.org/abs/2308.16599v1","updated":"2023-08-31T09:57:52Z","published":"2023-08-31T09:57:52Z","title":"A Causal Discovery Approach To Learn How Urban Form Shapes Sustainable\n Mobility Across Continents","summary":" Global sustainability requires low-carbon urban transport systems, shaped by\nadequate infrastructure, deployment of low-carbon transport modes and shifts in\ntravel behavior. To adequately implement alterations in infrastructure, it is\nessential to grasp the location-specific cause-and-effect mechanisms that the\nbuilt environment has on travel. Yet, current research falls short in\nrepresenting causal relationships between the 6D urban form variables and\ntravel, generalizing across different regions, and modeling urban form effects\nat high spatial resolution. Here, we address all three gaps by utilizing a\ncausal discovery and an explainable machine learning framework to detect urban\nform effects on intra-city travel based on high-resolution mobility data of six\ncities across three continents. We show that distance to the city center,\ndemographics, and density indirectly affect other urban form features. By\nconsidering the causal relationships, we find that location-specific influences\nalign across cities, yet vary in magnitude. In addition, the spread of the city\nand the coverage of jobs across the city are the strongest determinants of\ntravel-related emissions, highlighting the benefits of compact development.\nDifferences in urban form effects across the cities call for a more holistic\ndefinition of 6D measures. Our work is a starting point for location-specific\nanalysis of urban form effects on mobility behavior using causal discovery\napproaches, which is highly relevant for city planners and municipalities\nacross continents.\n","authors":["Felix Wagner","Florian Nachtigall","Lukas Franken","Nikola Milojevic-Dupont","Rafael H. M. Pereira","Nicolas Koch","Jakob Runge","Marta Gonzalez","Felix Creutzig"],"pdf_url":"https://arxiv.org/pdf/2308.16599v1.pdf","comment":"22 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.16598v1","updated":"2023-08-31T09:57:27Z","published":"2023-08-31T09:57:27Z","title":"Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation","summary":" Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential\nrole in the early diagnosis and treatment of liver cancer. Deep learning models\nbackboned by fully convolutional neural networks (FCNNs) have become the\ndominant models for segmenting 3D computerized tomography (CT) scans. However,\nsince their convolution layers suffer from limited kernel size, they are not\nable to capture long-range dependencies and global context. To tackle this\nrestriction, vision transformers have been introduced to solve FCNN's locality\nof receptive fields. Although transformers can capture long-range features,\ntheir segmentation performance decreases with various tumor sizes due to the\nmodel's sensitivity to the input patch size. 
While finding an optimal patch size\nimproves the performance of vision transformer-based models on segmentation\ntasks, it is a time-consuming and challenging procedure. This paper proposes a\ntechnique to select the vision transformer's optimal input multi-resolution\nimage patch size based on the average volume size of metastasis lesions. We\nfurther validated our suggested framework using a transfer-learning technique,\ndemonstrating that the highest Dice similarity coefficient (DSC) performance\nwas obtained by pre-training on training data with a larger tumor volume using\nthe suggested ideal patch size and then training with a smaller one. We\nexperimentally evaluate this idea through pre-training our model on a\nmulti-resolution public dataset. Our model showed consistent and improved\nresults when applied to our private multi-resolution mCRC dataset with a\nsmaller average tumor volume. This study lays the groundwork for optimizing\nsemantic segmentation of small objects using vision transformers. The\nimplementation source code is available at:\nhttps://github.com/Ramtin-Mojtahedi/OVTPS.\n","authors":["Ramtin Mojtahedi","Mohammad Hamghalam","Richard K. G. Do","Amber L. Simpson"],"pdf_url":"https://arxiv.org/pdf/2308.16598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16593v1","updated":"2023-08-31T09:50:33Z","published":"2023-08-31T09:50:33Z","title":"Towards Spontaneous Style Modeling with Semi-supervised Pre-training for\n Conversational Text-to-Speech Synthesis","summary":" The spontaneous behavior that often occurs in conversations makes speech more\nhuman-like compared to reading-style speech. However, synthesizing\nspontaneous-style speech is challenging due to the lack of high-quality\nspontaneous datasets and the high cost of labeling spontaneous behavior. In\nthis paper, we propose a semi-supervised pre-training method to increase the\namount of spontaneous-style speech and spontaneous behavioral labels. In the\nprocess of semi-supervised learning, both text and speech information are\nconsidered for detecting spontaneous behavior labels in speech. Moreover, a\nlinguistic-aware encoder is used to model the relationship between each\nsentence in the conversation. Experimental results indicate that our proposed\nmethod achieves superior expressive speech synthesis performance with the\nability to model spontaneous behavior in spontaneous-style speech and predict\nreasonable spontaneous behavior from text.\n","authors":["Weiqin Li","Shun Lei","Qiaochu Huang","Yixuan Zhou","Zhiyong Wu","Shiyin Kang","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16593v1.pdf","comment":"Accepted by INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2305.08396v4","updated":"2023-08-31T09:43:37Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with a\nnominal memory and computational burden. The inclusion of multi-axis\nself-attention, within each decoder stage, significantly enhances the\ndiscriminating capacity between the object and background regions, thereby\nhelping in improving the segmentation efficiency. 
In the Hybrid Decoder block,\nthe fusion process commences by integrating the upsampled lower-level decoder\nfeatures, obtained through transpose convolution, with the skip-connection\nfeatures derived from the hybrid encoder. Subsequently, the fused features\nundergo refinement through the utilization of a multi-axis attention mechanism.\nThe proposed decoder block is repeated multiple times to progressively segment\nthe nuclei regions. Experimental results on the MoNuSeg18 and MoNuSAC20\ndatasets demonstrate the effectiveness of the proposed technique. Our\nMaxViT-UNet outperformed the previous CNN-based (UNet) and Transformer-based\n(Swin-UNet) techniques by a considerable margin on both of the standard\ndatasets. The following GitHub repository\n(https://github.com/PRLAB21/MaxViT-UNet) contains the implementation and\ntrained weights.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v4.pdf","comment":"17 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.16585v1","updated":"2023-08-31T09:30:06Z","published":"2023-08-31T09:30:06Z","title":"Development and validation of an interpretable machine learning-based\n calculator for predicting 5-year weight trajectories after bariatric surgery:\n a multinational retrospective cohort SOPHIA study","summary":" Background: Weight loss trajectories after bariatric surgery vary widely\nbetween individuals, and predicting weight loss before the operation remains\nchallenging. We aimed to develop a model using machine learning to provide\nindividual preoperative prediction of 5-year weight loss trajectories after\nsurgery. Methods: In this multinational retrospective observational study we\nenrolled adult participants (aged $\\ge$18 years) from ten prospective cohorts\n(including ABOS [NCT01129297], BAREVAL [NCT02310178], the Swedish Obese\nSubjects study, and a large cohort from the Dutch Obesity Clinic [Nederlandse\nObesitas Kliniek]) and two randomised trials (SleevePass [NCT00793143] and\nSM-BOSS [NCT00356213]) in Europe, the Americas, and Asia, with a 5-year\nfollow-up after Roux-en-Y gastric bypass, sleeve gastrectomy, or gastric band.\nPatients with a previous history of bariatric surgery or large delays between\nscheduled and actual visits were excluded. The training cohort comprised\npatients from two centres in France (ABOS and BAREVAL). The primary outcome was\nBMI at 5 years. A model was developed using least absolute shrinkage and\nselection operator to select variables and the classification and regression\ntrees algorithm to build interpretable regression trees. The performances of\nthe model were assessed through the median absolute deviation (MAD) and root\nmean squared error (RMSE) of BMI. Findings: 10 231 patients from 12 centres in\nten countries were included in the analysis, corresponding to 30 602\npatient-years. Among participants in all 12 cohorts, 7701 (75.3%) were\nfemale, 2530 (24.7%) were male. 
Among 434 baseline attributes available\nin the training cohort, seven variables were selected: height, weight,\nintervention type, age, diabetes status, diabetes duration, and smoking status.\nAt 5 years, across external testing cohorts the overall mean MAD BMI was\n2.8 kg/m${}^2$ (95% CI 2.6-3.0) and mean RMSE BMI was 4.7 kg/m${}^2$\n(4.4-5.0), and the mean difference between predicted and observed BMI was\n-0.3 kg/m${}^2$ (SD 4.7). This model is incorporated in an easy-to-use and\ninterpretable web-based prediction tool to help inform clinical decisions\nbefore surgery. Interpretation: We developed a machine learning-based model,\nwhich is internationally validated, for predicting individual 5-year weight\nloss trajectories after three common bariatric interventions.\n","authors":["Patrick Saux","Pierre Bauvin","Violeta Raverdy","Julien Teigny","Hélène Verkindt","Tomy Soumphonphakdy","Maxence Debert","Anne Jacobs","Daan Jacobs","Valerie Monpellier","Phong Ching Lee","Chin Hong Lim","Johanna C Andersson-Assarsson","Lena Carlsson","Per-Arne Svensson","Florence Galtier","Guelareh Dezfoulian","Mihaela Moldovanu","Severine Andrieux","Julien Couster","Marie Lepage","Erminia Lembo","Ornella Verrastro","Maud Robert","Paulina Salminen","Geltrude Mingrone","Ralph Peterli","Ricardo V Cohen","Carlos Zerrweck","David Nocca","Carel W Le Roux","Robert Caiazzo","Philippe Preux","François Pattou"],"pdf_url":"https://arxiv.org/pdf/2308.16585v1.pdf","comment":"The Lancet Digital Health, 2023"},{"id":"http://arxiv.org/abs/2308.16572v1","updated":"2023-08-31T09:13:30Z","published":"2023-08-31T09:13:30Z","title":"CL-MAE: Curriculum-Learned Masked Autoencoders","summary":" Masked image modeling has been demonstrated as a powerful pretext task for\ngenerating robust representations that can be effectively generalized across\nmultiple downstream tasks. Typically, this approach involves randomly masking\npatches (tokens) in input images, with the masking strategy remaining unchanged\nduring training. In this paper, we propose a curriculum learning approach that\nupdates the masking strategy to continually increase the complexity of the\nself-supervised reconstruction task. We conjecture that, by gradually\nincreasing the task complexity, the model can learn more sophisticated and\ntransferable representations. To facilitate this, we introduce a novel\nlearnable masking module that possesses the capability to generate masks of\ndifferent complexities, and integrate the proposed module into masked\nautoencoders (MAE). Our module is jointly trained with the MAE, while adjusting\nits behavior during training, transitioning from a partner to the MAE\n(optimizing the same reconstruction loss) to an adversary (optimizing the\nopposite loss), while passing through a neutral state. The transition between\nthese behaviors is smooth, being regulated by a factor that is multiplied with\nthe reconstruction loss of the masking module. The resulting training procedure\ngenerates an easy-to-hard curriculum. We train our Curriculum-Learned Masked\nAutoencoder (CL-MAE) on ImageNet and show that it exhibits superior\nrepresentation learning capabilities compared to MAE. The empirical results on\nfive downstream tasks confirm our conjecture, demonstrating that curriculum\nlearning can be successfully used to self-supervise masked autoencoders.\n","authors":["Neelu Madan","Nicolae-Catalin Ristea","Kamal Nasrollahi","Thomas B. 
Moeslund","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2308.16572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16571v1","updated":"2023-08-31T09:12:34Z","published":"2023-08-31T09:12:34Z","title":"Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based\n Approach","summary":" In the rapidly evolving digital era, the analysis of document layouts plays a\npivotal role in automated information extraction and interpretation. In our\nwork, we have trained MViTv2 transformer model architecture with cascaded mask\nR-CNN on BaDLAD dataset to extract text box, paragraphs, images and tables from\na document. After training on 20365 document images for 36 epochs in a 3 phase\ncycle, we achieved a training loss of 0.2125 and a mask loss of 0.19. Our work\nextends beyond training, delving into the exploration of potential enhancement\navenues. We investigate the impact of rotation and flip augmentation, the\neffectiveness of slicing input images pre-inference, the implications of\nvarying the resolution of the transformer backbone, and the potential of\nemploying a dual-pass inference to uncover missed text-boxes. Through these\nexplorations, we observe a spectrum of outcomes, where some modifications\nresult in tangible performance improvements, while others offer unique insights\nfor future endeavors.\n","authors":["Ashrafur Rahman Khan","Asif Azad"],"pdf_url":"https://arxiv.org/pdf/2308.16571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16570v1","updated":"2023-08-31T09:12:30Z","published":"2023-08-31T09:12:30Z","title":"MONDEO: Multistage Botnet Detection","summary":" Mobile devices have widespread to become the most used piece of technology.\nDue to their characteristics, they have become major targets for botnet-related\nmalware. FluBot is one example of botnet malware that infects mobile devices.\nIn particular, FluBot is a DNS-based botnet that uses Domain Generation\nAlgorithms (DGA) to establish communication with the Command and Control Server\n(C2). MONDEO is a multistage mechanism with a flexible design to detect\nDNS-based botnet malware. MONDEO is lightweight and can be deployed without\nrequiring the deployment of software, agents, or configuration in mobile\ndevices, allowing easy integration in core networks. MONDEO comprises four\ndetection stages: Blacklisting/Whitelisting, Query rate analysis, DGA analysis,\nand Machine learning evaluation. It was created with the goal of processing\nstreams of packets to identify attacks with high efficiency, in the distinct\nphases. MONDEO was tested against several datasets to measure its efficiency\nand performance, being able to achieve high performance with RandomForest\nclassifiers. The implementation is available at github.\n","authors":["Duarte Dias","Bruno Sousa","Nuno Antunes"],"pdf_url":"https://arxiv.org/pdf/2308.16570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00262v2","updated":"2023-08-31T09:01:35Z","published":"2023-03-01T06:35:42Z","title":"Collage Diffusion","summary":" We seek to give users precise control over diffusion-based image generation\nby modeling complex scenes as sequences of layers, which define the desired\nspatial arrangement and visual attributes of objects in the scene. Collage\nDiffusion harmonizes the input layers to make objects fit together -- the key\nchallenge involves minimizing changes in the positions and key visual\nattributes of the input layers while allowing other attributes to change in the\nharmonization process. 
We ensure that objects are generated in the correct\nlocations by modifying text-image cross-attention with the layers' alpha masks.\nWe preserve key visual attributes of input layers by learning specialized text\nrepresentations per layer and by extending ControlNet to operate on layers.\nLayer input allows users to control the extent of image harmonization on a\nper-object basis, and users can even iteratively edit individual objects in\ngenerated images while keeping other objects fixed. By leveraging the rich\ninformation present in layer input, Collage Diffusion generates globally\nharmonized images that maintain desired object characteristics better than\nprior approaches.\n","authors":["Vishnu Sarukkai","Linden Li","Arden Ma","Christopher Ré","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2303.00262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15487v3","updated":"2023-08-31T08:58:17Z","published":"2023-03-27T07:53:43Z","title":"Knowledge Enhanced Graph Neural Networks for Graph Completion","summary":" Graph data is omnipresent and has a wide variety of applications, such as in\nnatural science, social networks, or the semantic web. However, while being\nrich in information, graphs are often noisy and incomplete. As a result, graph\ncompletion tasks, such as node classification or link prediction, have gained\nattention. On one hand, neural methods, such as graph neural networks, have\nproven to be robust tools for learning rich representations of noisy graphs. On\nthe other hand, symbolic methods enable exact reasoning on graphs. We propose\nKnowledge Enhanced Graph Neural Networks (KeGNN), a neuro-symbolic framework\nfor graph completion that combines both paradigms as it allows for the\nintegration of prior knowledge into a graph neural network model. Essentially,\nKeGNN consists of a graph neural network as a base upon which knowledge\nenhancement layers are stacked with the goal of refining predictions with\nrespect to prior knowledge. We instantiate KeGNN in conjunction with two\nstate-of-the-art graph neural networks, Graph Convolutional Networks and Graph\nAttention Networks, and evaluate KeGNN on multiple benchmark datasets for node\nclassification.\n","authors":["Luisa Werner","Nabil Layaïda","Pierre Genevès","Sarah Chlyah"],"pdf_url":"https://arxiv.org/pdf/2303.15487v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05102v2","updated":"2023-08-31T08:43:17Z","published":"2023-03-09T08:21:50Z","title":"StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent\n Disentangled Space","summary":" One major challenge in machine learning applications is coping with\nmismatches between the datasets used in the development and those obtained in\nreal-world applications. These mismatches may lead to inaccurate predictions\nand errors, resulting in poor product quality and unreliable systems. In this\nstudy, we propose StyleDiff to inform developers of the differences between the\ntwo datasets for the steady development of machine learning systems. Using\ndisentangled image spaces obtained from recently proposed generative models,\nStyleDiff compares the two datasets by focusing on attributes in the images and\nprovides an easy-to-understand analysis of the differences between the\ndatasets. The proposed StyleDiff performs in $O(dN\\log N)$, where $N$ is the\nsize of the datasets and $d$ is the number of attributes, enabling the\napplication to large datasets. 
We demonstrate that StyleDiff accurately detects\ndifferences between datasets and presents them in an understandable format\nusing, for example, driving scenes datasets.\n","authors":["Keisuke Kawano","Takuro Kutsuna","Ryoko Tokuhisa","Akihiro Nakamura","Yasushi Esaki"],"pdf_url":"https://arxiv.org/pdf/2303.05102v2.pdf","comment":"25 pages, 17 figures, Image and Vision Computing"},{"id":"http://arxiv.org/abs/2308.16544v1","updated":"2023-08-31T08:34:20Z","published":"2023-08-31T08:34:20Z","title":"Forecasting Emergency Department Crowding with Advanced Machine Learning\n Models and Multivariable Input","summary":" Emergency department (ED) crowding is a significant threat to patient safety\nand it has been repeatedly associated with increased mortality. Forecasting\nfuture service demand has the potential to improve patient outcomes. Despite\nactive research on the subject, several gaps remain: 1) proposed forecasting\nmodels have become outdated due to the quick influx of advanced machine\nlearning (ML) models, 2) the amount of multivariable input data has been\nlimited, and 3) discrete performance metrics have rarely been reported. In this\nstudy, we document the performance of a set of advanced ML models in\nforecasting ED occupancy 24 hours ahead. We use electronic health record data\nfrom a large, combined ED with an extensive set of explanatory variables,\nincluding the availability of beds in catchment area hospitals, traffic data\nfrom local observation stations, weather variables, etc. We show that N-BEATS\nand LightGBM outperform benchmarks with 11% and 9% respective improvements and\nthat DeepAR predicts next-day crowding with an AUC of 0.76 (95% CI 0.69-0.84).\nTo the best of our knowledge, this is the first study to document the\nsuperiority of LightGBM and N-BEATS over statistical benchmarks in the context\nof ED forecasting.\n","authors":["Jalmari Tuominen","Eetu Pulkkinen","Jaakko Peltonen","Juho Kanniainen","Niku Oksala","Ari Palomäki","Antti Roine"],"pdf_url":"https://arxiv.org/pdf/2308.16544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16541v1","updated":"2023-08-31T08:30:26Z","published":"2023-08-31T08:30:26Z","title":"Scalable Incomplete Multi-View Clustering with Structure Alignment","summary":" The success of existing multi-view clustering (MVC) relies on the assumption\nthat all views are complete. However, samples are usually only partially\navailable due to data corruption or sensor malfunction, which motivates\nresearch on incomplete multi-view clustering (IMVC). Although several\nanchor-based IMVC methods have been proposed to process large-scale incomplete\ndata, they still suffer from the following drawbacks: i) Most existing\napproaches neglect the inter-view discrepancy and enforce cross-view\nrepresentations to be consistent, which would corrupt the representation\ncapability of the model; ii) Due to the sample disparity between different\nviews, the learned anchors might be misaligned, which we refer to as the\nAnchor-Unaligned Problem for Incomplete data (AUP-ID). The AUP-ID causes\ninaccurate graph fusion and degrades clustering performance. To tackle these\nissues, we propose a novel incomplete anchor graph learning framework termed\nScalable Incomplete Multi-View Clustering with Structure Alignment (SIMVC-SA).\nSpecifically, we construct the view-specific anchor graph to capture the\ncomplementary information from different views. In order to solve the AUP-ID,\nwe propose a novel structure alignment module to refine the cross-view anchor\ncorrespondence. 
Meanwhile, the\nanchor graph construction and alignment are jointly optimized in our unified\nframework to enhance clustering quality. Through anchor graph construction\ninstead of full graphs, the time and space complexity of the proposed SIMVC-SA\nis proven to be linearly correlated with the number of samples. Extensive\nexperiments on seven incomplete benchmark datasets demonstrate the\neffectiveness and efficiency of our proposed method. Our code is publicly\navailable at https://github.com/wy1019/SIMVC-SA.\n","authors":["Yi Wen","Siwei Wang","Ke Liang","Weixuan Liang","Xinhang Wan","Xinwang Liu","Suyuan Liu","Jiyuan Liu","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.16541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16539v1","updated":"2023-08-31T08:30:11Z","published":"2023-08-31T08:30:11Z","title":"On a Connection between Differential Games, Optimal Control, and\n Energy-based Models for Multi-Agent Interactions","summary":" Game theory offers an interpretable mathematical framework for modeling\nmulti-agent interactions. However, its applicability in real-world robotics\napplications is hindered by several challenges, such as unknown agents'\npreferences and goals. To address these challenges, we show a connection\nbetween differential games, optimal control, and energy-based models and\ndemonstrate how existing approaches can be unified under our proposed\nEnergy-based Potential Game formulation. Building upon this formulation, this\nwork introduces a new end-to-end learning application that combines neural\nnetworks for game-parameter inference with a differentiable game-theoretic\noptimization layer, acting as an inductive bias. The experiments using\nsimulated mobile robot pedestrian interactions and real-world automated driving\ndata provide empirical evidence that the game-theoretic layer improves the\npredictive performance of various neural network backbones.\n","authors":["Christopher Diehl","Tobias Klosek","Martin Krüger","Nils Murzyn","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2308.16539v1.pdf","comment":"International Conference on Machine Learning, Workshop on New\n Frontiers in Learning, Control, and Dynamical Systems (ICML 2023\n Frontiers4LCD)"},{"id":"http://arxiv.org/abs/2308.16534v1","updated":"2023-08-31T08:25:47Z","published":"2023-08-31T08:25:47Z","title":"Conditioning Score-Based Generative Models by Neuro-Symbolic Constraints","summary":" Score-based and diffusion models have emerged as effective approaches for\nboth conditional and unconditional generation. Still, conditional generation is\nbased on either a specific training of a conditional model or classifier\nguidance, which requires training a noise-dependent classifier, even when the\nclassifier for uncorrupted data is given. We propose an approach to sample from\nunconditional score-based generative models enforcing arbitrary logical\nconstraints, without any additional training. Firstly, we show how to\nmanipulate the learned score in order to sample from an un-normalized\ndistribution conditional on a user-defined constraint. Then, we define a\nflexible and numerically stable neuro-symbolic framework for encoding soft\nlogical constraints. Combining these two ingredients, we obtain a general, but\napproximate, conditional sampling algorithm. We further develop effective\nheuristics aimed at improving the approximation. 
Finally, we show the\neffectiveness of our approach for various types of constraints and data:\ntabular data, images and time series.\n","authors":["Davide Scassola","Sebastiano Saccani","Ginevra Carbone","Luca Bortolussi"],"pdf_url":"https://arxiv.org/pdf/2308.16534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16528v1","updated":"2023-08-31T08:19:26Z","published":"2023-08-31T08:19:26Z","title":"SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded\n Objects","summary":" To enable meaningful robotic manipulation of objects in the real world, 6D\npose estimation is one of the critical aspects. Most existing approaches have\ndifficulty extending predictions to scenarios where novel object instances are\ncontinuously introduced, especially with heavy occlusions. In this work, we\npropose a few-shot pose estimation (FSPE) approach called SA6D, which uses a\nself-adaptive segmentation module to identify the novel target object and\nconstruct a point cloud model of the target object using only a small number of\ncluttered reference images. Unlike existing methods, SA6D does not require\nobject-centric reference images or any additional object information, making it\na more generalizable and scalable solution across categories. We evaluate SA6D\non real-world tabletop object datasets and demonstrate that SA6D outperforms\nexisting FSPE methods, particularly in cluttered scenes with occlusions, while\nrequiring fewer reference images.\n","authors":["Ning Gao","Ngo Anh Vien","Hanna Ziesche","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08566v2","updated":"2023-08-31T08:17:57Z","published":"2023-03-15T12:34:24Z","title":"Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning","summary":" Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful\nalternative to full fine-tuning for adapting pre-trained vision models to\ndownstream tasks, as it tunes only a small number of parameters while freezing\nthe vast majority to ease the storage burden and optimization difficulty.\nHowever, existing PEFT methods introduce trainable parameters at the same\npositions across different tasks, depending solely on human heuristics, and\nneglect the domain gaps. To this end, we study where to introduce and how to\nallocate trainable parameters by proposing a novel Sensitivity-aware visual\nParameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates\ntrainable parameters to task-specific important positions given a desired\ntunable parameter budget. Specifically, our SPT first quickly identifies the\nsensitive parameters that require tuning for a given task in a data-dependent\nway. Next, our SPT further boosts the representational capability for the\nweight matrices whose number of sensitive parameters exceeds a pre-defined\nthreshold by utilizing existing structured tuning methods, e.g., LoRA [23] or\nAdapter [22], to replace directly tuning the selected sensitive parameters\n(unstructured tuning) under the budget. Extensive experiments on a wide range\nof downstream recognition tasks show that our SPT is complementary to the\nexisting PEFT methods and largely boosts their performance, e.g., SPT improves\nAdapter with supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean\nTop-1 accuracy, reaching SOTA performance on FGVC and VTAB-1k benchmarks,\nrespectively. 
Source code is at https://github.com/ziplab/SPT\n","authors":["Haoyu He","Jianfei Cai","Jing Zhang","Dacheng Tao","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2303.08566v2.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2305.19979v2","updated":"2023-08-31T08:02:35Z","published":"2023-05-31T16:04:25Z","title":"Knowledge Graph Embeddings in the Biomedical Domain: Are They Useful? A\n Look at Link Prediction, Rule Learning, and Downstream Polypharmacy Tasks","summary":" Knowledge graphs are powerful tools for representing and organising complex\nbiomedical data. Several knowledge graph embedding algorithms have been\nproposed to learn from and complete knowledge graphs. However, a recent study\ndemonstrates the limited efficacy of these embedding algorithms when applied to\nbiomedical knowledge graphs, raising the question of whether knowledge graph\nembeddings have limitations in biomedical settings. This study aims to apply\nstate-of-the-art knowledge graph embedding models in the context of a recent\nbiomedical knowledge graph, BioKG, and evaluate their performance and potential\ndownstream uses. We achieve a three-fold improvement in terms of performance\nbased on the HITS@10 score over previous work on the same biomedical knowledge\ngraph. Additionally, we provide interpretable predictions through a rule-based\nmethod. We demonstrate that knowledge graph embedding models are applicable in\npractice by evaluating the best-performing model on four tasks that represent\nreal-life polypharmacy situations. Results suggest that knowledge learnt from\nlarge biomedical knowledge graphs can be transferred to such downstream use\ncases. Our code is available at https://github.com/aryopg/biokge.\n","authors":["Aryo Pradipta Gema","Dominik Grabarczyk","Wolf De Wulf","Piyush Borole","Javier Antonio Alfaro","Pasquale Minervini","Antonio Vergari","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2305.19979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16516v1","updated":"2023-08-31T08:00:08Z","published":"2023-08-31T08:00:08Z","title":"Curvature-based Pooling within Graph Neural Networks","summary":" Over-squashing and over-smoothing are two critical issues that limit the\ncapabilities of graph neural networks (GNNs). While over-smoothing eliminates\nthe differences between nodes, making them indistinguishable, over-squashing\nrefers to the inability of GNNs to propagate information over long distances,\nas exponentially many node states are squashed into fixed-size representations.\nBoth phenomena share similar causes, as both are largely induced by the graph\ntopology. To mitigate these problems in graph classification tasks, we propose\nCurvPool, a novel pooling method. CurvPool exploits the notion of curvature of\na graph to adaptively identify structures responsible for both over-smoothing\nand over-squashing. By clustering nodes based on the Balanced Forman curvature,\nCurvPool constructs a graph with a more suitable structure, allowing deeper\nmodels and the combination of distant information. We compare it to other\nstate-of-the-art pooling approaches and establish its competitiveness in terms\nof classification accuracy, computational complexity, and flexibility. CurvPool\noutperforms several comparable methods across all considered tasks. 
The most\nconsistent results are achieved by pooling densely connected clusters using the\nsum aggregation, as this retains additional information about the size of each\npool.\n","authors":["Cedric Sanders","Andreas Roth","Thomas Liebig"],"pdf_url":"https://arxiv.org/pdf/2308.16516v1.pdf","comment":"ECMLPKDD 2023 - Workshop on Mining and Learning with Graphs"},{"id":"http://arxiv.org/abs/2308.16139v2","updated":"2023-08-31T07:26:50Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Arcot Sowmya","Eduardo A. 
Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2305.15777v2","updated":"2023-08-31T07:20:34Z","published":"2023-05-25T06:44:43Z","title":"Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation","summary":" Medical image data are often limited due to the expensive acquisition and\nannotation process. Hence, training a deep-learning model with only raw data\ncan easily lead to overfitting. One solution to this problem is to augment the\nraw data with various transformations, improving the model's ability to\ngeneralize to new data. However, manually configuring a generic augmentation\ncombination and parameters for different datasets is non-trivial due to\ninconsistent acquisition approaches and data distributions. Therefore,\nautomatic data augmentation is proposed to learn favorable augmentation\nstrategies for different datasets while incurring large GPU overhead. To this\nend, we present a novel method, called Dynamic Data Augmentation (DDAug), which\nis efficient and has negligible computation cost. Our DDAug develops a\nhierarchical tree structure to represent various augmentations and utilizes an\nefficient Monte-Carlo tree searching algorithm to update, prune, and sample the\ntree. As a result, the augmentation pipeline can be optimized for each dataset\nautomatically. Experiments on multiple Prostate MRI datasets show that our\nmethod outperforms the current state-of-the-art data augmentation strategies.\n","authors":["Xinyue Xu","Yuhan Hsi","Haonan Wang","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2305.15777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06777v3","updated":"2023-08-31T07:03:19Z","published":"2023-06-11T21:14:29Z","title":"Improving the Validity of Decision Trees as Explanations","summary":" In classification and forecasting with tabular data, one often utilizes\ntree-based models. Those can be competitive with deep neural networks on\ntabular data [cf. Grinsztajn et al., NeurIPS 2022, arXiv:2207.08815] and, under\nsome conditions, explainable. The explainability depends on the depth of the\ntree and the accuracy in each leaf of the tree. 
Decision trees containing\nleaves with unbalanced accuracy can provide misleading explanations.\nLow-accuracy leaves give less valid explanations, which could be interpreted as\nunfairness among explanations. Here, we train a shallow tree with the objective\nof minimizing the maximum misclassification error across each leaf node. Then,\nwe extend each leaf with a separate tree-based model. The shallow tree provides\na global explanation, while the overall statistical performance of the shallow\ntree with extended leaves improves upon decision trees of unlimited depth\ntrained using classical methods (e.g., CART) and is comparable to\nstate-of-the-art methods (e.g., well-tuned XGBoost).\n","authors":["Jiri Nemecek","Tomas Pevny","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2306.06777v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16491v1","updated":"2023-08-31T06:53:22Z","published":"2023-08-31T06:53:22Z","title":"In-class Data Analysis Replications: Teaching Students while Testing\n Science","summary":" Science is facing a reproducibility crisis. Previous work has proposed\nincorporating data analysis replications into classrooms as a potential\nsolution. However, despite the potential benefits, it is unclear whether this\napproach is feasible, and if so, what the involved stakeholders-students,\neducators, and scientists-should expect from it. Can students perform a data\nanalysis replication over the course of a class? What are the costs and\nbenefits for educators? And how can this solution help benchmark and improve\nthe state of science?\n In the present study, we incorporated data analysis replications in the\nproject component of the Applied Data Analysis course (CS-401) taught at EPFL\n(N=354 students). Here we report pre-registered findings based on surveys\nadministered throughout the course. First, we demonstrate that students can\nreplicate previously published scientific papers, most of them qualitatively\nand some exactly. We find discrepancies between what students expect of data\nanalysis replications and what they experience by doing them along with changes\nin expectations about reproducibility, which together serve as evidence of\nattitude shifts to foster students' critical thinking. Second, we provide\ninformation for educators about how much overhead is needed to incorporate\nreplications into the classroom and identify concerns that replications bring\nas compared to more traditional assignments. Third, we identify tangible\nbenefits of the in-class data analysis replications for scientific communities,\nsuch as a collection of replication reports and insights about replication\nbarriers in scientific work that should be avoided going forward.\n Overall, we demonstrate that incorporating replication tasks into a large\ndata science class can increase the reproducibility of scientific work as a\nby-product of data science instruction, thus benefiting both science and\nstudents.\n","authors":["Kristina Gligoric","Tiziano Piccardi","Jake Hofman","Robert West"],"pdf_url":"https://arxiv.org/pdf/2308.16491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16490v1","updated":"2023-08-31T06:52:43Z","published":"2023-08-31T06:52:43Z","title":"Latent Painter","summary":" Latent diffusers revolutionized the generative AI and inspired creative art.\nWhen denoising the latent, the predicted original image at each step\ncollectively animates the formation. 
However, the animation is limited by the\ndenoising nature of the diffuser, and only renders a sharpening process. This\nwork presents Latent Painter, which uses the latent as the canvas, and the\ndiffuser predictions as the plan, to generate painting animation. Latent\nPainter also transitions one generated image into another, which can happen\nbetween images from two different sets of checkpoints.\n","authors":["Shih-Chieh Su"],"pdf_url":"https://arxiv.org/pdf/2308.16490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11594v2","updated":"2023-08-31T06:48:29Z","published":"2023-08-20T05:03:31Z","title":"Quantization-based Optimization with Perspective of Quantum Mechanics","summary":" Statistical and stochastic analysis based on thermodynamics has been the main\nanalysis framework for stochastic global optimization. With the recent\nappearance of quantum annealing and quantum tunneling algorithms for global\noptimization, a new research framework for global optimization algorithms is\nrequired. In this paper, we provide the analysis for quantization-based\noptimization based on the Schr\\"odinger equation to reveal what property in\nquantum mechanics enables global optimization. We show that the tunneling\neffect derived from the Schr\\"odinger equation in quantization-based\noptimization enables escape from a local minimum. Additionally, we confirm that\nthis tunneling effect is the same property included in quantum mechanics-based\nglobal optimization. Experiments with standard multi-modal benchmark functions\ndemonstrate that the proposed analysis is valid.\n","authors":["Jinwuk Seok","Changsik Cho"],"pdf_url":"https://arxiv.org/pdf/2308.11594v2.pdf","comment":"Preprint for ICTC conference (First Revision)"},{"id":"http://arxiv.org/abs/2308.16484v1","updated":"2023-08-31T06:44:59Z","published":"2023-08-31T06:44:59Z","title":"Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning","summary":" Affordable 3D scanners often produce sparse and non-uniform point clouds that\nnegatively impact downstream applications in robotic systems. While existing\npoint cloud upsampling architectures have demonstrated promising results on\nstandard benchmarks, they tend to experience significant performance drops when\nthe test data have different distributions from the training data. To address\nthis issue, this paper proposes a test-time adaption approach to enhance model\ngenerality of point cloud upsampling. The proposed approach leverages\nmeta-learning to explicitly learn network parameters for test-time adaption.\nOur method does not require any prior information about the test data. During\nmeta-training, the model parameters are learned from a collection of\ninstance-level tasks, each of which consists of a sparse-dense pair of point\nclouds from the training data. During meta-testing, the trained model is\nfine-tuned with a few gradient updates to produce a unique set of network\nparameters for each test instance. The updated model is then used for the final\nprediction. Our framework is generic and can be applied in a plug-and-play\nmanner with existing backbone networks in point cloud upsampling. 
Extensive\nexperiments demonstrate that our approach improves the performance of\nstate-of-the-art models.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16483v1","updated":"2023-08-31T06:44:42Z","published":"2023-08-31T06:44:42Z","title":"Echocardiographic View Classification with Integrated\n Out-of-Distribution Detection for Enhanced Automatic Echocardiographic\n Analysis","summary":" In the rapidly evolving field of automatic echocardiographic analysis and\ninterpretation, automatic view classification is a critical yet challenging\ntask, owing to the inherent complexity and variability of echocardiographic\ndata. This study presents ECHOcardiography VIew Classification with\nOut-of-Distribution dEtection (ECHO-VICODE), a novel deep learning-based\nframework that effectively addresses this challenge by training to classify 31\nclasses, surpassing previous studies and demonstrating its capacity to handle a\nwide range of echocardiographic views. Furthermore, ECHO-VICODE incorporates an\nintegrated out-of-distribution (OOD) detection function, leveraging the\nrelative Mahalanobis distance to effectively identify 'near-OOD' instances\ncommonly encountered in echocardiographic data. Through extensive\nexperimentation, we demonstrated the outstanding performance of ECHO-VICODE in\nterms of view classification and OOD detection, significantly reducing the\npotential for errors in echocardiographic analyses. This pioneering study\nsignificantly advances the domain of automated echocardiography analysis and\nexhibits promising prospects for substantial applications in extensive clinical\nresearch and practice.\n","authors":["Jaeik Jeon","Seongmin Ha","Yeonyee E. Yoon","Jiyeon Kim","Hyunseok Jeong","Dawun Jeong","Yeonggul Jang","Youngtaek Hong","Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2308.16483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06620v2","updated":"2023-08-31T06:35:36Z","published":"2023-07-13T08:36:15Z","title":"Online Distributed Learning with Quantized Finite-Time Coordination","summary":" In this paper we consider online distributed learning problems. Online\ndistributed learning refers to the process of training learning models on\ndistributed data sources. In our setting a set of agents need to cooperatively\ntrain a learning model from streaming data. Differently from federated\nlearning, the proposed approach does not rely on a central server but only on\npeer-to-peer communications among the agents. This approach is often used in\nscenarios where data cannot be moved to a centralized location due to privacy,\nsecurity, or cost reasons. In order to overcome the absence of a central\nserver, we propose a distributed algorithm that relies on a quantized,\nfinite-time coordination protocol to aggregate the locally trained models.\nFurthermore, our algorithm allows for the use of stochastic gradients during\nlocal training. Stochastic gradients are computed using a randomly sampled\nsubset of the local training data, which makes the proposed algorithm more\nefficient and scalable than traditional gradient descent. In our paper, we\nanalyze the performance of the proposed algorithm in terms of the mean distance\nfrom the online solution. Finally, we present numerical results for a logistic\nregression task.\n","authors":["Nicola Bastianello","Apostolos I. Rikos","Karl H. 
Johansson"],"pdf_url":"https://arxiv.org/pdf/2307.06620v2.pdf","comment":"To be presented at IEEE CDC'23"},{"id":"http://arxiv.org/abs/2308.16481v1","updated":"2023-08-31T06:32:11Z","published":"2023-08-31T06:32:11Z","title":"Point-TTA: Test-Time Adaptation for Point Cloud Registration Using\n Multitask Meta-Auxiliary Learning","summary":" We present Point-TTA, a novel test-time adaptation framework for point cloud\nregistration (PCR) that improves the generalization and the performance of\nregistration models. While learning-based approaches have achieved impressive\nprogress, generalization to unknown testing environments remains a major\nchallenge due to the variations in 3D scans. Existing methods typically train a\ngeneric model and the same trained model is applied on each instance during\ntesting. This could be sub-optimal since it is difficult for the same model to\nhandle all the variations during testing. In this paper, we propose a test-time\nadaptation approach for PCR. Our model can adapt to unseen distributions at\ntest-time without requiring any prior knowledge of the test data. Concretely,\nwe design three self-supervised auxiliary tasks that are optimized jointly with\nthe primary PCR task. Given a test instance, we adapt our model using these\nauxiliary tasks and the updated model is used to perform the inference. During\ntraining, our model is trained using a meta-auxiliary learning approach, such\nthat the adapted model via auxiliary tasks improves the accuracy of the primary\ntask. Experimental results demonstrate the effectiveness of our approach in\nimproving generalization of point cloud registration and outperforming other\nstate-of-the-art approaches.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11656v4","updated":"2023-08-31T06:11:13Z","published":"2022-11-21T17:15:46Z","title":"Sequential Informed Federated Unlearning: Efficient and Provable Client\n Unlearning in Federated Optimization","summary":" The aim of Machine Unlearning (MU) is to provide theoretical guarantees on\nthe removal of the contribution of a given data point from a training\nprocedure. Federated Unlearning (FU) consists in extending MU to unlearn a\ngiven client's contribution from a federated training routine. Current FU\napproaches are generally not scalable, and do not come with sound theoretical\nquantification of the effectiveness of unlearning. In this work we present\nInformed Federated Unlearning (IFU), a novel efficient and quantifiable FU\napproach. Upon unlearning request from a given client, IFU identifies the\noptimal FL iteration from which FL has to be reinitialized, with unlearning\nguarantees obtained through a randomized perturbation mechanism. 
The theory of\nIFU is also extended to account for sequential unlearning requests.\nExperimental results on different tasks and dataset show that IFU leads to more\nefficient unlearning procedures as compared to basic re-training and\nstate-of-the-art FU approaches.\n","authors":["Yann Fraboni","Martin Van Waerebeke","Kevin Scaman","Richard Vidal","Laetitia Kameni","Marco Lorenzi"],"pdf_url":"https://arxiv.org/pdf/2211.11656v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16471v1","updated":"2023-08-31T05:26:14Z","published":"2023-08-31T05:26:14Z","title":"A Policy Adaptation Method for Implicit Multitask Reinforcement Learning\n Problems","summary":" In dynamic motion generation tasks, including contact and collisions, small\nchanges in policy parameters can lead to extremely different returns. For\nexample, in soccer, the ball can fly in completely different directions with a\nsimilar heading motion by slightly changing the hitting position or the force\napplied to the ball or when the friction of the ball varies. However, it is\ndifficult to imagine that completely different skills are needed for heading a\nball in different directions. In this study, we proposed a multitask\nreinforcement learning algorithm for adapting a policy to implicit changes in\ngoals or environments in a single motion category with different reward\nfunctions or physical parameters of the environment. We evaluated the proposed\nmethod on the ball heading task using a monopod robot model. The results showed\nthat the proposed method can adapt to implicit changes in the goal positions or\nthe coefficients of restitution of the ball, whereas the standard domain\nrandomization approach cannot cope with different task settings.\n","authors":["Satoshi Yamamori","Jun Morimoto"],"pdf_url":"https://arxiv.org/pdf/2308.16471v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.16470v1","updated":"2023-08-31T05:26:08Z","published":"2023-08-31T05:26:08Z","title":"Domain-adaptive Message Passing Graph Neural Network","summary":" Cross-network node classification (CNNC), which aims to classify nodes in a\nlabel-deficient target network by transferring the knowledge from a source\nnetwork with abundant labels, draws increasing attention recently. To address\nCNNC, we propose a domain-adaptive message passing graph neural network\n(DM-GNN), which integrates graph neural network (GNN) with conditional\nadversarial domain adaptation. DM-GNN is capable of learning informative\nrepresentations for node classification that are also transferrable across\nnetworks. Firstly, a GNN encoder is constructed by dual feature extractors to\nseparate ego-embedding learning from neighbor-embedding learning so as to\njointly capture commonality and discrimination between connected nodes.\nSecondly, a label propagation node classifier is proposed to refine each node's\nlabel prediction by combining its own prediction and its neighbors' prediction.\nIn addition, a label-aware propagation scheme is devised for the labeled source\nnetwork to promote intra-class propagation while avoiding inter-class\npropagation, thus yielding label-discriminative source embeddings. Thirdly,\nconditional adversarial domain adaptation is performed to take the\nneighborhood-refined class-label information into account during adversarial\ndomain adaptation, so that the class-conditional distributions across networks\ncan be better matched. 
Comparisons with eleven state-of-the-art methods\ndemonstrate the effectiveness of the proposed DM-GNN.\n","authors":["Xiao Shen","Shirui Pan","Kup-Sze Choi","Xi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16468v1","updated":"2023-08-31T05:22:51Z","published":"2023-08-31T05:22:51Z","title":"Computing excited states of molecules using normalizing flows","summary":" We present a new nonlinear variational framework for simultaneously computing\nground and excited states of quantum systems. Our approach is based on\napproximating wavefunctions in the linear span of basis functions that are\naugmented and optimized \\emph{via} composition with normalizing flows. The\naccuracy and efficiency of our approach are demonstrated in the calculations of\na large number of vibrational states of the triatomic H$_2$S molecule as well\nas ground and several excited electronic states of prototypical one-electron\nsystems including the hydrogen atom, the molecular hydrogen ion, and a carbon\natom in a single-active-electron approximation. The results demonstrate\nsignificant improvements in the accuracy of energy predictions and accelerated\nbasis-set convergence even when using normalizing flows with a small number of\nparameters. The present approach can be also seen as the optimization of a set\nof intrinsic coordinates that best capture the underlying physics within the\ngiven basis set.\n","authors":["Yahya Saleh","Álvaro Fernández Corral","Armin Iske","Jochen Küpper","Andrey Yachmenev"],"pdf_url":"https://arxiv.org/pdf/2308.16468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07864v2","updated":"2023-08-31T05:11:10Z","published":"2022-11-15T03:10:05Z","title":"Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning","summary":" Federated learning (FL) enables multiple clients to collaboratively train a\nglobal model without disclosing their data. Previous researches often require\ntraining the complete model parameters. However, the emergence of powerful\npre-trained models makes it possible to achieve higher performance with fewer\nlearnable parameters in FL. In this paper, we propose a federated adaptive\nprompt tuning algorithm, FedAPT, for multi-domain collaborative image\nclassification with powerful foundation models, like CLIP. Compared with direct\nfederated prompt tuning, our core idea is to adaptively unlock specific domain\nknowledge for each test sample in order to provide them with personalized\nprompts. To implement this idea, we design an adaptive prompt tuning module,\nwhich consists of a meta prompt, an adaptive network, and some keys. The server\nrandomly generates a set of keys and assigns a unique key to each client. Then\nall clients cooperatively train the global adaptive network and meta prompt\nwith the local datasets and the frozen keys. Ultimately, the global aggregation\nmodel can assign a personalized prompt to CLIP based on the domain features of\neach test sample. We perform extensive experiments on two multi-domain image\nclassification datasets across two different settings - supervised and\nunsupervised. 
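A minimal rendering of the adaptive prompt tuning module described above (meta prompt, adaptive network, frozen keys); the shapes and the gating rule are illustrative assumptions, not the released FedAPT code.

```python
import numpy as np

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

def personalized_prompt(feat, keys, meta_prompt, W_adapt):
    """Illustrative FedAPT-style adaptive prompt: the adaptive network scores
    the frozen keys against a test sample's domain feature, and the scores
    gate how much domain-specific signal is mixed into the shared meta prompt.
    feat: (f,), keys: (K, d), meta_prompt: (d,), W_adapt: (d, f)."""
    scores = softmax(keys @ (W_adapt @ feat))   # one weight per domain key
    domain_part = scores @ keys                 # convex mix of the keys
    return meta_prompt + domain_part            # prompt handed to the CLIP encoder
```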
The results show that FedAPT can achieve better performance with\nless than 10\\% of the number of parameters of the fully trained model, and the\nglobal model can perform well in diverse client domains simultaneously.\n","authors":["Shangchao Su","Mingzhao Yang","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2211.07864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16458v1","updated":"2023-08-31T04:52:58Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained language models like ChatGPT have significantly improved code\ngeneration. As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks. Moreover, in bioinformatics, generating\nfunctional programs poses additional notable challenges due to the amount of\ndomain knowledge, the need for complicated data operations, and intricate\nfunctional dependencies between the operations. Here, we present BioCoder, a\nbenchmark developed to evaluate existing pre-trained models in generating\nbioinformatics code. In relation to function-code generation, BioCoder covers\npotential package dependencies, class declarations, and global variables. It\nincorporates 1026 functions and 1243 methods in Python and Java from GitHub and\n253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing\nframework for evaluation, and we have applied it to evaluate many models\nincluding InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+,\nInstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes\nthe importance of domain knowledge, pragmatic code generation, and contextual\nunderstanding. Our dataset, benchmark, Docker images, and scripts required for\ntesting are all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16456v1","updated":"2023-08-31T04:48:59Z","published":"2023-08-31T04:48:59Z","title":"Least Squares Maximum and Weighted Generalization-Memorization Machines","summary":" In this paper, we propose a new way of remembering by introducing a memory\ninfluence mechanism for the least squares support vector machine (LSSVM).\nWithout changing the equation constraints of the original LSSVM, this\nmechanism, allows an accurate partitioning of the training set without\noverfitting. The maximum memory impact model (MIMM) and the weighted impact\nmemory model (WIMM) are then proposed. It is demonstrated that these models can\nbe degraded to the LSSVM. Furthermore, we propose some different memory impact\nfunctions for the MIMM and WIMM. 
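Since MIMM and WIMM reduce to the LSSVM when the memory influence vanishes, a useful reference point is the weighted LSSVM system below: a sketch assuming a regression setting and an RBF kernel, with per-sample memory weights rescaling the regularization (the paper's exact impact functions differ).

```python
import numpy as np

def rbf_kernel(X, Y, gamma=1.0):
    d2 = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

def weighted_lssvm_fit(X, y, C=10.0, mem_weights=None, gamma=1.0):
    """Sketch of a memory-weighted LSSVM in the spirit of WIMM: per-sample
    memory-influence weights rescale the effective regularization, and the
    standard LSSVM is recovered when all weights equal one."""
    n = len(y)
    w = np.ones(n) if mem_weights is None else mem_weights
    K = rbf_kernel(X, X, gamma)
    # KKT system of the LSSVM: [[0, 1^T], [1, K + diag(1/(C*w))]] [b; a] = [0; y]
    A = np.zeros((n + 1, n + 1))
    A[0, 1:] = 1.0
    A[1:, 0] = 1.0
    A[1:, 1:] = K + np.diag(1.0 / (C * w))
    rhs = np.concatenate([[0.0], y])
    sol = np.linalg.solve(A, rhs)
    b, alpha = sol[0], sol[1:]
    return alpha, b
```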
The experimental results show that our\nMIMM and WIMM have better generalization performance compared to the LSSVM and a\nsignificant advantage in time cost compared to other memory models.\n","authors":["Shuai Wang","Zhen Wang","Yuan-Hai Shao"],"pdf_url":"https://arxiv.org/pdf/2308.16456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16454v1","updated":"2023-08-31T04:46:12Z","published":"2023-08-31T04:46:12Z","title":"Adversarial Finetuning with Latent Representation Constraint to Mitigate\n Accuracy-Robustness Tradeoff","summary":" This paper addresses the tradeoff between standard accuracy on clean examples\nand robustness against adversarial examples in deep neural networks (DNNs).\nAlthough adversarial training (AT) improves robustness, it degrades the\nstandard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we\npropose a novel AT method called ARREST, which comprises three components: (i)\nadversarial finetuning (AFT), (ii) representation-guided knowledge distillation\n(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples\nby initializing its parameters with a DNN that is standardly pretrained on\nclean examples. RGKD and NR respectively entail a regularization term and an\nalgorithm to preserve latent representations of clean examples during AFT. RGKD\npenalizes the distance between the representations of the standardly pretrained\nand AFT DNNs. NR switches input adversarial examples to nonadversarial ones\nwhen the representation changes significantly during AFT. By combining these\ncomponents, ARREST achieves both high standard accuracy and robustness.\nExperimental results demonstrate that ARREST mitigates the tradeoff more\neffectively than previous AT-based methods do.\n","authors":["Satoshi Suzuki","Shin'ya Yamaguchi","Shoichiro Takeda","Sekitoshi Kanai","Naoki Makishima","Atsushi Ando","Ryo Masumura"],"pdf_url":"https://arxiv.org/pdf/2308.16454v1.pdf","comment":"Accepted by International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.16453v1","updated":"2023-08-31T04:45:44Z","published":"2023-08-31T04:45:44Z","title":"Listen to Minority: Encrypted Traffic Classification for Class Imbalance\n with Contrastive Pre-Training","summary":" Mobile Internet has profoundly reshaped modern lifestyles in various aspects.\nEncrypted Traffic Classification (ETC) naturally plays a crucial role in\nmanaging mobile Internet, especially with the explosive growth of mobile apps\nusing encrypted communication. Despite some existing learning-based ETC methods\nshowing promising results, three-fold limitations still remain in real-world\nnetwork environments: 1) label bias caused by traffic class imbalance, 2)\ntraffic homogeneity caused by component sharing, and 3) training with reliance\non sufficient labeled traffic. None of the existing ETC methods can address all\nthese limitations. In this paper, we propose a novel Pre-trAining\nSemi-Supervised ETC framework, dubbed PASS. Our key insight is to resample the\noriginal train dataset and perform contrastive pre-training without using\nindividual app labels directly to avoid label bias issues caused by class\nimbalance, while obtaining a robust feature representation to differentiate\noverlapping homogeneous traffic by pulling positive traffic pairs closer and\npushing negative pairs away.
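The contrastive pre-training step just described, pulling two views of the same flow together and pushing other flows apart without app labels, is typically realized with an InfoNCE-style objective; a toy NumPy version under that assumption:

```python
import numpy as np

def info_nce(z1, z2, tau=0.1):
    """Toy InfoNCE objective for PASS-style contrastive pre-training: the two
    views of the same flow (row i of z1 and z2) are a positive pair; every
    other row in the batch acts as a negative. No app labels are needed."""
    z1 = z1 / np.linalg.norm(z1, axis=1, keepdims=True)
    z2 = z2 / np.linalg.norm(z2, axis=1, keepdims=True)
    logits = z1 @ z2.T / tau
    logits -= logits.max(axis=1, keepdims=True)        # numerical stability
    log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    idx = np.arange(len(z1))
    return -log_prob[idx, idx].mean()                  # positives sit on the diagonal
```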
Meanwhile, PASS designs a semi-supervised\noptimization strategy based on pseudo-label iteration and dynamic loss\nweighting algorithms in order to effectively utilize massive unlabeled traffic\ndata and alleviate the manual annotation workload for the training dataset. PASS outperforms\nstate-of-the-art ETC methods and generic sampling approaches on four public\ndatasets with significant class imbalance and traffic homogeneity, remarkably\npushing the F1 of Cross-Platform215 by 1.31% and of ISCX-17 by 9.12%.\nFurthermore, we validate the generality of the contrastive pre-training and\npseudo-label iteration components of PASS, which can adaptively benefit ETC\nmethods with diverse feature extractors.\n","authors":["Xiang Li","Juncheng Guo","Qige Song","Jiang Xie","Yafei Sang","Shuyuan Zhao","Yongzheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16453v1.pdf","comment":"Accepted by 2023 IEEE SECON, 9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.11029v2","updated":"2023-08-31T04:36:30Z","published":"2023-08-18T11:29:12Z","title":"RBA-GCN: Relational Bilevel Aggregation Graph Convolutional Network for\n Emotion Recognition","summary":" Emotion recognition in conversation (ERC) has received increasing attention\nfrom researchers due to its wide range of applications. As conversation has a\nnatural graph structure, numerous approaches used to model ERC based on graph\nconvolutional networks (GCNs) have yielded significant results. However, the\naggregation approach of traditional GCNs suffers from the node information\nredundancy problem, leading to node discriminant information\nloss. Additionally, single-layer GCNs lack the capacity to capture long-range\ncontextual information from the graph. Furthermore, the majority of approaches\nare based on textual modality or stitching together different modalities,\nresulting in a weak ability to capture interactions between modalities. To\naddress these problems, we present the relational bilevel aggregation graph\nconvolutional network (RBA-GCN), which consists of three modules: the graph\ngeneration module (GGM), similarity-based cluster building module (SCBM) and\nbilevel aggregation module (BiAM). First, GGM constructs a novel graph to\nreduce the redundancy of target node information. Then, SCBM calculates the node\nsimilarity in the target node and its structural neighborhood, where noisy\ninformation with low similarity is filtered out to preserve the discriminant\ninformation of the node. Meanwhile, BiAM is a novel aggregation method that can\npreserve the information of nodes during the aggregation process. This module\ncan construct the interaction between different modalities and capture\nlong-range contextual information based on similarity clusters. On both the\nIEMOCAP and MELD datasets, the weighted average F1 score of RBA-GCN has a\n2.17$\sim$5.21\% improvement over that of the most advanced method. Our code is\navailable at https://github.com/luftmenscher/RBA-GCN, and our article of the\nsame name has been published in IEEE/ACM Transactions on Audio, Speech, and\nLanguage Processing, vol. 31, 2023.\n","authors":["Lin Yuan","Guoheng Huang","Fenghuan Li","Xiaochen Yuan","Chi-Man Pun","Guo Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.11029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16437v1","updated":"2023-08-31T03:52:57Z","published":"2023-08-31T03:52:57Z","title":"AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR\n Prediction","summary":" Click-through rate (CTR) prediction is a crucial issue in recommendation\nsystems.
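Returning to the PASS pipeline above: its pseudo-label iteration with dynamic loss weighting can be sketched roughly as below, assuming an sklearn-style classifier exposing predict_proba (the names and the confidence-weighting rule are illustrative, not the paper's exact algorithm).

```python
import numpy as np

def pseudo_label_round(model, X_unlab, threshold=0.95):
    """One round of PASS-style pseudo-label iteration (illustrative): keep
    only confident predictions on unlabeled traffic, and weight them by
    confidence so noisier pseudo-labels contribute less to the loss."""
    probs = model.predict_proba(X_unlab)
    conf, labels = probs.max(axis=1), probs.argmax(axis=1)
    keep = conf >= threshold
    sample_weight = conf[keep]          # dynamic loss weighting by confidence
    return X_unlab[keep], labels[keep], sample_weight
```

The selected triples would then be appended to the labeled pool and the classifier refit, repeating until the unlabeled pool stops yielding confident flows.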
There has been an emergence of various public CTR datasets. However,\nexisting datasets primarily suffer from the following limitations. Firstly,\nusers generally click different types of items from multiple scenarios, and\nmodeling from multiple scenarios can provide a more comprehensive understanding\nof users. Existing datasets only include data for the same type of items from a\nsingle scenario. Secondly, multi-modal features are essential in multi-scenario\nprediction as they address the issue of inconsistent ID encoding between\ndifferent scenarios. The existing datasets are based on ID features and lack\nmulti-modal features. Third, a large-scale dataset can provide a more reliable\nevaluation of models, fully reflecting the performance differences between\nmodels. The scale of existing datasets is around 100 million, which is\nrelatively small compared to the real-world CTR prediction. To address these\nlimitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset\nbased on industrial data from Alipay. Specifically, AntM$^{2}$C provides the\nfollowing advantages: 1) It covers CTR data of 5 different types of items,\nproviding insights into the preferences of users for different items, including\nadvertisements, vouchers, mini-programs, contents, and videos. 2) Apart from\nID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text\nand image features, which can effectively establish connections between items\nwith different IDs. 3) AntM$^{2}$C provides 1 billion CTR data with 200\nfeatures, including 200 million users and 6 million items. It is currently the\nlargest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several\ntypical CTR tasks and provide comparisons with baseline methods. The dataset\nhomepage is available at https://www.atecup.cn/home.\n","authors":["Zhaoxin Huan","Ke Ding","Ang Li","Xiaolu Zhang","Xu Min","Yong He","Liang Zhang","Jun Zhou","Linjian Mo","Jinjie Gu","Zhongyi Liu","Wenliang Zhong","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v5","updated":"2023-08-31T03:47:35Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding and Improving Adversarial\n Transferability from Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. 
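The two factors in this trade-off can both be probed numerically. Below is a sketch of simple diagnostics, assuming callables that return input gradients for the surrogate and target models; the paper's formal definitions are more careful than these proxies.

```python
import numpy as np

def gradient_similarity(grad_s, grad_t, X):
    """Average cosine similarity between surrogate and target input
    gradients -- a rough stand-in for the gradient-similarity factor."""
    sims = []
    for x in X:
        gs, gt = grad_s(x).ravel(), grad_t(x).ravel()
        sims.append(gs @ gt / (np.linalg.norm(gs) * np.linalg.norm(gt) + 1e-12))
    return float(np.mean(sims))

def smoothness_proxy(grad_s, X, eps=1e-3, trials=8):
    """Crude estimate of model smoothness: how much the input gradient moves
    under small random perturbations (smaller means smoother)."""
    drifts = []
    for x in X:
        g0 = grad_s(x).ravel()
        for _ in range(trials):
            d = np.random.randn(*x.shape) * eps
            drifts.append(np.linalg.norm(grad_s(x + d).ravel() - g0) / eps)
    return float(np.mean(drifts))
```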
Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v5.pdf","comment":"IEEE Symposium on Security and Privacy (Oakland) 2024; Extended\n version of camera-ready"},{"id":"http://arxiv.org/abs/2308.16425v1","updated":"2023-08-31T03:28:43Z","published":"2023-08-31T03:28:43Z","title":"On the Equivalence between Implicit and Explicit Neural Networks: A\n High-dimensional Viewpoint","summary":" Implicit neural networks have demonstrated remarkable success in various\ntasks. However, there is a lack of theoretical analysis of the connections and\ndifferences between implicit and explicit networks. In this paper, we study\nhigh-dimensional implicit neural networks and provide the high dimensional\nequivalents for the corresponding conjugate kernels and neural tangent kernels.\nBuilt upon this, we establish the equivalence between implicit and explicit\nnetworks in high dimensions.\n","authors":["Zenan Ling","Zhenyu Liao","Robert C. Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.16425v1.pdf","comment":"Accepted by Workshop on High-dimensional Learning Dynamics, ICML\n 2023, Honolulu, Hawaii"},{"id":"http://arxiv.org/abs/2308.16422v1","updated":"2023-08-31T03:16:38Z","published":"2023-08-31T03:16:38Z","title":"DECODE: DilatEd COnvolutional neural network for Detecting\n Extreme-mass-ratio inspirals","summary":" The detection of Extreme Mass Ratio Inspirals (EMRIs) is intricate due to\ntheir complex waveforms, extended duration, and low signal-to-noise ratio\n(SNR), making them more challenging to be identified compared to compact binary\ncoalescences. While matched filtering-based techniques are known for their\ncomputational demands, existing deep learning-based methods primarily handle\ntime-domain data and are often constrained by data duration and SNR. In\naddition, most existing work ignores time-delay interferometry (TDI) and\napplies the long-wavelength approximation in detector response calculations,\nthus limiting their ability to handle laser frequency noise. In this study, we\nintroduce DECODE, an end-to-end model focusing on EMRI signal detection by\nsequence modeling in the frequency domain. Centered around a dilated causal\nconvolutional neural network, trained on synthetic data considering TDI-1.5\ndetector response, DECODE can efficiently process a year's worth of\nmultichannel TDI data with an SNR of around 50. 
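The backbone ingredient here, a dilated causal convolution, is easy to state precisely. A minimal single-channel NumPy version follows (DECODE itself is a full multi-channel network trained on TDI responses):

```python
import numpy as np

def causal_dilated_conv(x, w, dilation):
    """1-D causal convolution with dilation: the output at time t sees only
    x[t], x[t-d], x[t-2d], ..., never the future."""
    k = len(w)
    pad = dilation * (k - 1)
    xp = np.concatenate([np.zeros(pad), x])   # left-pad so output stays causal
    return np.array([sum(w[j] * xp[t + pad - j * dilation] for j in range(k))
                     for t in range(len(x))])
```

Stacking such layers with dilations 1, 2, 4, 8, ... grows the receptive field exponentially while preserving causality, which is what makes year-long sequences tractable.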
We evaluate our model on 1-year\ndata with accumulated SNR ranging from 50 to 120 and achieve a true positive\nrate of 96.3% at a false positive rate of 1%, keeping an inference time of less\nthan 0.01 seconds. With the visualization of three showcased EMRI signals for\ninterpretability and generalization, DECODE exhibits strong potential for\nfuture space-based gravitational wave data analyses.\n","authors":["Tianyu Zhao","Yue Zhou","Ruijun Shi","Zhoujian Cao","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.16422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.01129v2","updated":"2023-08-31T03:01:32Z","published":"2021-04-02T16:10:24Z","title":"Simulation-Based Optimization of User Interfaces for Quality-Assuring\n Machine Learning Model Predictions","summary":" Quality-sensitive applications of machine learning (ML) require quality\nassurance (QA) by humans before the predictions of an ML model can be deployed.\nQA for ML (QA4ML) interfaces require users to view a large amount of data and\nperform many interactions to correct errors made by the ML model. An optimized\nuser interface (UI) can significantly reduce interaction costs. While UI\noptimization can be informed by user studies evaluating design options, this\napproach is not scalable because there are typically numerous small variations\nthat can affect the efficiency of a QA4ML interface. Hence, we propose using\nsimulation to evaluate and aid the optimization of QA4ML interfaces. In\nparticular, we focus on simulating the combined effects of human intelligence\nin initiating appropriate interaction commands and machine intelligence in\nproviding algorithmic assistance for accelerating QA4ML processes. As QA4ML is\nusually labor-intensive, we use the simulated task completion time as the\nmetric for UI optimization under different interface and algorithm setups. We\ndemonstrate the usage of this UI design method in several QA4ML applications.\n","authors":["Yu Zhang","Martijn Tennekes","Tim de Jong","Lyana Curier","Bob Coecke","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2104.01129v2.pdf","comment":"Published in ACM Transactions on Interactive Intelligent Systems"},{"id":"http://arxiv.org/abs/2206.09429v4","updated":"2023-08-31T02:46:33Z","published":"2022-06-19T15:09:23Z","title":"Extending regionalization algorithms to explore spatial process\n heterogeneity","summary":" In spatial regression models, spatial heterogeneity may be considered with\neither continuous or discrete specifications. The latter is related to\ndelineation of spatially connected regions with homogeneous relationships\nbetween variables (spatial regimes). Although various regionalization\nalgorithms have been proposed and studied in the field of spatial analytics,\nmethods to optimize spatial regimes have been largely unexplored. In this\npaper, we propose two new algorithms for spatial regime delineation, two-stage\nK-Models and Regional-K-Models. We also extend the classic Automatic Zoning\nProcedure to spatial regression context. The proposed algorithms are applied to\na series of synthetic datasets and two real-world datasets. Results indicate\nthat all three algorithms achieve superior or comparable performance to\nexisting approaches, while the two-stage K-Models algorithm largely outperforms\nexisting approaches on model fitting, region reconstruction, and coefficient\nestimation. 
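The flavor of the K-Models algorithms above can be conveyed with a short alternating loop: assign each observation to the regional regression that fits it best, refit, and repeat. This sketch omits the spatial-contiguity handling that distinguishes the paper's two-stage variant.

```python
import numpy as np

def k_models_regionalization(X, y, k=3, iters=20, seed=0):
    """Illustrative K-Models loop for spatial regimes: alternate between
    fitting one linear model per region and reassigning each observation to
    the region whose model predicts it best."""
    rng = np.random.default_rng(seed)
    labels = rng.integers(0, k, size=len(y))
    coefs = []
    for _ in range(iters):
        coefs = []
        for r in range(k):
            m = labels == r
            if m.sum() < X.shape[1]:          # keep degenerate regions harmless
                coefs.append(np.zeros(X.shape[1]))
                continue
            coefs.append(np.linalg.lstsq(X[m], y[m], rcond=None)[0])
        resid = np.stack([np.abs(y - X @ c) for c in coefs])   # (k, n) residuals
        labels = resid.argmin(axis=0)
    return labels, coefs
```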
Our work enriches the spatial analytics toolbox to explore spatial\nheterogeneous processes.\n","authors":["Hao Guo","Andre Python","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2206.09429v4.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.03312v5","updated":"2023-08-31T02:29:36Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07446v2","updated":"2023-08-31T02:28:41Z","published":"2023-02-15T03:32:33Z","title":"On-Demand Communication for Asynchronous Multi-Agent Bandits","summary":" This paper studies a cooperative multi-agent multi-armed stochastic bandit\nproblem where agents operate asynchronously -- agent pull times and rates are\nunknown, irregular, and heterogeneous -- and face the same instance of a\nK-armed bandit problem. Agents can share reward information to speed up the\nlearning process at additional communication costs. We propose ODC, an\non-demand communication protocol that tailors the communication of each pair of\nagents based on their empirical pull times. ODC is efficient when the pull\ntimes of agents are highly heterogeneous, and its communication complexity\ndepends on the empirical pull times of agents. ODC is a generic protocol that\ncan be integrated into most cooperative bandit algorithms without degrading\ntheir performance. We then incorporate ODC into the natural extensions of UCB\nand AAE algorithms and propose two communication-efficient cooperative\nalgorithms. Our analysis shows that both algorithms are near-optimal in regret.\n","authors":["Yu-Zhen Janice Chen","Lin Yang","Xuchuang Wang","Xutong Liu","Mohammad Hajiesmaili","John C. S. 
Lui","Don Towsley"],"pdf_url":"https://arxiv.org/pdf/2302.07446v2.pdf","comment":"Accepted by AISTATS 2023"},{"id":"http://arxiv.org/abs/2208.00780v5","updated":"2023-08-31T02:27:48Z","published":"2022-07-26T10:59:42Z","title":"Visual correspondence-based explanations improve AI robustness and\n human-AI team accuracy","summary":" Explaining artificial intelligence (AI) predictions is increasingly important\nand even imperative in many high-stakes applications where humans are the\nultimate decision-makers. In this work, we propose two novel architectures of\nself-interpretable image classifiers that first explain, and then predict (as\nopposed to post-hoc explanations) by harnessing the visual correspondences\nbetween a query image and exemplars. Our models consistently improve (by 1 to 4\npoints) on out-of-distribution (OOD) datasets while performing marginally worse\n(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest\nneighbor classifier (kNN). Via a large-scale, human study on ImageNet and CUB,\nour correspondence-based explanations are found to be more useful to users than\nkNN explanations. Our explanations help users more accurately reject AI's wrong\ndecisions than all other tested methods. Interestingly, for the first time, we\nshow that it is possible to achieve complementary human-AI team accuracy (i.e.,\nthat is higher than either AI-alone or human-alone), in ImageNet and CUB image\nclassification tasks.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2208.00780v5.pdf","comment":"NeurIPS 2022 conference paper"},{"id":"http://arxiv.org/abs/2308.15690v2","updated":"2023-08-31T02:21:20Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprouts image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology. For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The label has 4 different\nclasses: background, head, body, tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, 5 physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. Also, we hope that this dataset can assist researchers\nstudying classification, semantic segmentation, decomposition, and physical\nfeature measurement in other industrial fields, in evaluating their models. The\ndataset is available at the authors' repository. 
(https://bhban.kr/data)\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v2.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"},{"id":"http://arxiv.org/abs/2308.16406v1","updated":"2023-08-31T02:20:25Z","published":"2023-08-31T02:20:25Z","title":"CktGNN: Circuit Graph Neural Network for Electronic Design Automation","summary":" The electronic design automation of analog circuits has been a longstanding\nchallenge in the integrated circuit field due to the huge design space and\ncomplex design trade-offs among circuit specifications. In the past decades,\nintensive research efforts have mostly been paid to automate the transistor\nsizing with a given circuit topology. By recognizing the graph nature of\ncircuits, this paper presents a Circuit Graph Neural Network (CktGNN) that\nsimultaneously automates the circuit topology generation and device sizing\nbased on the encoder-dependent optimization subroutines. Particularly, CktGNN\nencodes circuit graphs using a two-level GNN framework (of nested GNN) where\ncircuits are represented as combinations of subgraphs in a known subgraph\nbasis. In this way, it significantly improves design efficiency by reducing the\nnumber of subgraphs to perform message passing. Nonetheless, another critical\nroadblock to advancing learning-assisted circuit design automation is a lack of\npublic benchmarks to perform canonical assessment and reproducible research. To\ntackle the challenge, we introduce Open Circuit Benchmark (OCB), an\nopen-sourced dataset that contains $10$K distinct operational amplifiers with\ncarefully-extracted circuit specifications. OCB is also equipped with\ncommunicative circuit generation and evaluation capabilities such that it can\nhelp to generalize CktGNN to design various analog circuits by producing\ncorresponding datasets. Experiments on OCB show the extraordinary advantages of\nCktGNN through representation-based optimization frameworks over other recent\npowerful GNN baselines and human experts' manual designs. Our work paves the\nway toward a learning-based open-sourced design automation for analog circuits.\nOur source code is available at \\url{https://github.com/zehao-dong/CktGNN}.\n","authors":["Zehao Dong","Weidong Cao","Muhan Zhang","Dacheng Tao","Yixin Chen","Xuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16406v1.pdf","comment":"Accepted by ICLR (International Conference on Learning\n Representations) 2023"},{"id":"http://arxiv.org/abs/2308.16403v1","updated":"2023-08-31T02:12:46Z","published":"2023-08-31T02:12:46Z","title":"Balancing between the Local and Global Structures (LGS) in Graph\n Embedding","summary":" We present a method for balancing between the Local and Global Structures\n(LGS) in graph embedding, via a tunable parameter. Some embedding methods aim\nto capture global structures, while others attempt to preserve local\nneighborhoods. Few methods attempt to do both, and it is not always possible to\ncapture well both local and global information in two dimensions, which is\nwhere most graph drawing live. The choice of using a local or a global\nembedding for visualization depends not only on the task but also on the\nstructure of the underlying data, which may not be known in advance. For a\ngiven graph, LGS aims to find a good balance between the local and global\nstructure to preserve. 
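The tunable balance in LGS can be pictured as one scalar blending two stress terms; the sketch below assumes normalized stress and a hop-distance notion of locality, which may differ from the paper's actual objective.

```python
import numpy as np

def lgs_style_stress(pos, D, t=0.5, local_radius=2):
    """Illustrative LGS-flavoured objective: a tunable blend of global stress
    over all pairs and local stress restricted to graph neighbourhoods within
    `local_radius` hops. t=1 recovers a fully global layout, t=0 a local one.
    pos: (n, 2) coordinates; D: (n, n) positive graph-theoretic distances."""
    n = len(D)
    s_glob, s_loc = 0.0, 0.0
    for i in range(n):
        for j in range(i + 1, n):
            d = np.linalg.norm(pos[i] - pos[j])
            err = (d - D[i, j]) ** 2 / D[i, j] ** 2    # normalized stress term
            s_glob += err
            if D[i, j] <= local_radius:
                s_loc += err
    return t * s_glob + (1 - t) * s_loc
```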
We evaluate the performance of LGS with synthetic and\nreal-world datasets and our results indicate that it is competitive with the\nstate-of-the-art methods, using established quality metrics such as stress and\nneighborhood preservation. We introduce a novel quality metric, cluster\ndistance preservation, to assess intermediate structure capture. All\nsource code, datasets, experiments and analysis are available online.\n","authors":["Jacob Miller","Vahan Huroyan","Stephen Kobourov"],"pdf_url":"https://arxiv.org/pdf/2308.16403v1.pdf","comment":"Appears in the Proceedings of the 31st International Symposium on\n Graph Drawing and Network Visualization (GD 2023)"},{"id":"http://arxiv.org/abs/2305.11304v2","updated":"2023-08-31T02:10:40Z","published":"2023-05-16T07:00:57Z","title":"pTSE: A Multi-model Ensemble Method for Probabilistic Time Series\n Forecasting","summary":" Various probabilistic time series forecasting models have sprung up and shown\nremarkably good performance. However, the choice of model highly relies on the\ncharacteristics of the input time series and the fixed distribution that the\nmodel is based on. Due to the fact that the probability distributions cannot be\naveraged over different models straightforwardly, the current time series model\nensemble methods cannot be directly applied to improve the robustness and\naccuracy of forecasting. To address this issue, we propose pTSE, a multi-model\ndistribution ensemble method for probabilistic forecasting based on Hidden\nMarkov Model (HMM). pTSE only takes off-the-shelf outputs from member models\nwithout requiring further information about each model. Besides, we provide a\ncomplete theoretical analysis of pTSE to prove that the empirical distribution\nof time series subject to an HMM will converge to the stationary distribution\nalmost surely. Experiments on benchmarks show the superiority of pTSE over all\nmember models and competitive ensemble methods.\n","authors":["Yunyi Zhou","Zhixuan Chu","Yijia Ruan","Ge Jin","Yuchen Huang","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2305.11304v2.pdf","comment":"The 32nd International Joint Conference on Artificial Intelligence\n (IJCAI 2023)"},{"id":"http://arxiv.org/abs/2308.13570v2","updated":"2023-08-31T02:01:13Z","published":"2023-08-25T05:52:41Z","title":"Stochastic Configuration Machines for Industrial Artificial Intelligence","summary":" Real-time predictive modelling with desired accuracy is highly expected in\nindustrial artificial intelligence (IAI), where neural networks play a key\nrole. Neural networks in IAI require powerful, high-performance computing\ndevices to operate on large amounts of floating point data. Based on stochastic\nconfiguration networks (SCNs), this paper proposes a new randomized learner\nmodel, termed stochastic configuration machines (SCMs), to stress effective\nmodelling and data size saving that are useful and valuable for industrial\napplications. Compared to SCNs and random vector functional-link (RVFL) nets\nwith binarized implementation, the model storage of SCMs can be significantly\ncompressed while retaining favourable prediction performance. Besides the\narchitecture of the SCM learner model and its learning algorithm, as an\nimportant part of this contribution, we also provide a theoretical basis on the\nlearning capacity of SCMs by analysing the model's complexity. Experimental\nstudies are carried out over some benchmark datasets and three industrial\napplications.
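Stepping back to pTSE above: the HMM machinery it builds on is the standard forward recursion, which here yields filtered state posteriors that can serve as time-varying weights over member models. This is an illustrative reading, not the paper's full procedure.

```python
import numpy as np

def forward_posteriors(likelihoods, trans, init):
    """Normalized HMM forward recursion. likelihoods[t, s] measures how well
    member/state s explains observation t; trans is the row-stochastic state
    transition matrix and init the initial distribution. Returns filtered
    state posteriors, usable as time-varying ensemble weights."""
    T, S = likelihoods.shape
    alpha = np.zeros((T, S))
    alpha[0] = init * likelihoods[0]
    alpha[0] /= alpha[0].sum()
    for t in range(1, T):
        alpha[t] = likelihoods[t] * (alpha[t - 1] @ trans)
        alpha[t] /= alpha[t].sum()
    return alpha
```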
The results demonstrate that SCM has great potential for dealing\nwith industrial data analytics.\n","authors":["Dianhui Wang","Matthew J. Felicetti"],"pdf_url":"https://arxiv.org/pdf/2308.13570v2.pdf","comment":"23 pages, 7 figures, 12 tables"},{"id":"http://arxiv.org/abs/2308.16391v1","updated":"2023-08-31T01:54:31Z","published":"2023-08-31T01:54:31Z","title":"Improving Robustness and Accuracy of Ponzi Scheme Detection on Ethereum\n Using Time-Dependent Features","summary":" The rapid development of blockchain has led to more and more funding pouring\ninto the cryptocurrency market, which also attracted cybercriminals' interest\nin recent years. The Ponzi scheme, an old-fashioned fraud, is now popular on\nthe blockchain, causing considerable financial losses to many crypto-investors.\nA few Ponzi detection methods have been proposed in the literature, most of\nwhich detect a Ponzi scheme based on its smart contract source code or opcode.\nThe contract-code-based approach, while achieving very high accuracy, is not\nrobust: first, the source codes of a majority of contracts on Ethereum are not\navailable, and second, a Ponzi developer can fool a contract-code-based\ndetection model by obfuscating the opcode or inventing a new profit\ndistribution logic that cannot be detected (since these models were trained on\nexisting Ponzi logics only). A transaction-based approach could improve the\nrobustness of detection because transactions, unlike smart contracts, are\nharder to be manipulated. However, the current transaction-based detection\nmodels achieve fairly low accuracy. We address this gap in the literature by\ndeveloping new detection models that rely only on the transactions, hence\nguaranteeing the robustness, and moreover, achieve considerably higher\nAccuracy, Precision, Recall, and F1-score than existing transaction-based\nmodels. This is made possible thanks to the introduction of novel\ntime-dependent features that capture Ponzi behaviours characteristics derived\nfrom our comprehensive data analyses on Ponzi and non-Ponzi data from the\nXBlock-ETH repository\n","authors":["Phuong Duy Huynh","Son Hoang Dau","Xiaodong Li","Phuc Luong","Emanuele Viterbo"],"pdf_url":"https://arxiv.org/pdf/2308.16391v1.pdf","comment":"17 pages, 9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2201.12994v4","updated":"2023-08-31T01:38:14Z","published":"2022-01-31T04:15:42Z","title":"MGNN: Graph Neural Networks Inspired by Distance Geometry Problem","summary":" Graph Neural Networks (GNNs) have emerged as a prominent research topic in\nthe field of machine learning. Existing GNN models are commonly categorized\ninto two types: spectral GNNs, which are designed based on polynomial graph\nfilters, and spatial GNNs, which utilize a message-passing scheme as the\nfoundation of the model. For the expressive power and universality of spectral\nGNNs, a natural approach is to improve the design of basis functions for better\napproximation ability. As for spatial GNNs, models like Graph Isomorphism\nNetworks (GIN) analyze their expressive power based on Graph Isomorphism Tests.\nRecently, there have been attempts to establish connections between spatial\nGNNs and geometric concepts like curvature and cellular sheaves, as well as\nphysical phenomena like oscillators. However, despite the recent progress,\nthere is still a lack of comprehensive analysis regarding the universality of\nspatial GNNs from the perspectives of geometry and physics. 
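For the Ponzi-detection work above, the gist of time-dependent transaction features can be sketched with a few timing and flow statistics; the feature names below are hypothetical stand-ins, not the paper's feature set.

```python
import numpy as np

def time_dependent_features(txs):
    """Hypothetical transaction features in the spirit of the paper: timing
    and flow statistics that are harder for a Ponzi developer to disguise
    than contract code. Each tx is (timestamp, value, direction), with
    direction +1 for inflow and -1 for outflow."""
    ts = np.array([t for t, _, _ in txs], dtype=float)
    vals = np.array([v for _, v, _ in txs], dtype=float)
    dirs = np.array([d for _, _, d in txs], dtype=float)
    gaps = np.diff(np.sort(ts)) if len(ts) > 1 else np.array([0.0])
    inflow = vals[dirs > 0].sum()
    outflow = vals[dirs < 0].sum()
    return {
        "mean_gap": gaps.mean(),
        "gap_cv": gaps.std() / (gaps.mean() + 1e-9),   # burstiness of activity
        "pay_in_ratio": inflow / (inflow + outflow + 1e-9),
        "lifetime": ts.max() - ts.min() if len(ts) else 0.0,
    }
```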
In this paper, we\npropose MetricGNN (MGNN), a spatial GNN model inspired by the\ncongruent-insensitivity property of classifiers in the classification phase of\nGNNs. We demonstrate that a GNN model is universal in the spatial domain if it\ncan generate embedding matrices that are congruent to any given embedding\nmatrix. This property is closely related to the Distance Geometry Problem\n(DGP). Since DGP is an NP-Hard combinatorial optimization problem, we propose\noptimizing an energy function derived from spring networks and the\nMulti-Dimensional Scaling (MDS) problem. This approach also allows our model to\nhandle both homophilic and heterophilic graphs. Finally, we propose employing\nthe iteration method to optimize our energy function. We extensively evaluate\nthe effectiveness of our model through experiments conducted on both synthetic\nand real-world datasets. Our code is available at:\nhttps://github.com/GuanyuCui/MGNN.\n","authors":["Guanyu Cui","Zhewei Wei"],"pdf_url":"https://arxiv.org/pdf/2201.12994v4.pdf","comment":"Accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2302.12977v3","updated":"2023-08-31T01:28:35Z","published":"2023-02-25T04:12:30Z","title":"Fair Attribute Completion on Graph with Missing Attributes","summary":" Tackling unfairness in graph learning models is a challenging task, as the\nunfairness issues on graphs involve both attributes and topological structures.\nExisting work on fair graph learning simply assumes that attributes of all\nnodes are available for model training and then makes fair predictions. In\npractice, however, the attributes of some nodes might not be accessible due to\nmissing data or privacy concerns, which makes fair graph learning even more\nchallenging. In this paper, we propose FairAC, a fair attribute completion\nmethod, to complement missing information and learn fair node embeddings for\ngraphs with missing attributes. FairAC adopts an attention mechanism to deal\nwith the attribute missing problem and meanwhile, it mitigates two types of\nunfairness, i.e., feature unfairness from attributes and topological unfairness\ndue to attribute completion. FairAC can work on various types of homogeneous\ngraphs and generate fair embeddings for them and thus can be applied to most\ndownstream tasks to improve their fairness performance. To our best knowledge,\nFairAC is the first method that jointly addresses the graph attribution\ncompletion and graph unfairness problems. Experimental results on benchmark\ndatasets show that our method achieves better fairness performance with less\nsacrifice in accuracy, compared with the state-of-the-art methods of fair graph\nlearning. Code is available at: https://github.com/donglgcn/FairAC.\n","authors":["Dongliang Guo","Zhixuan Chu","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2302.12977v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15449v2","updated":"2023-08-31T01:11:10Z","published":"2023-02-23T23:51:44Z","title":"Backpropagation through Back Substitution with a Backslash","summary":" We present a linear algebra formulation of backpropagation which allows the\ncalculation of gradients by using a generically written ``backslash'' or\nGaussian elimination on triangular systems of equations. Generally, the matrix\nelements are operators. 
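The backslash view of backpropagation can be demonstrated concretely: stack the layer Jacobians of a chain into a block lower-triangular system and obtain all adjoints from one transposed solve. A NumPy sketch follows; the paper works with operator-valued matrices in Julia, and this is only the plain numeric analogue.

```python
import numpy as np

def chain_grad_by_backslash(jacobians, grad_out):
    """Backpropagation as back substitution: for a chain x_{l+1} = f_l(x_l),
    place each Jacobian J_l in block (l+1, l) of a block lower-triangular
    matrix M = I - L, then solve M^T g = rhs. The solution stacks all adjoints
    g_l = (dx_L/dx_l)^T grad_out; the first block is the input gradient."""
    sizes = [J.shape[1] for J in jacobians] + [jacobians[-1].shape[0]]
    offs = np.cumsum([0] + sizes)
    N = offs[-1]
    M = np.eye(N)
    for l, J in enumerate(jacobians):          # block (l+1, l) holds -J_l
        M[offs[l + 1]:offs[l + 2], offs[l]:offs[l + 1]] = -J
    rhs = np.zeros(N)
    rhs[offs[-2]:] = grad_out                  # seed the gradient at the output
    g = np.linalg.solve(M.T, rhs)              # a triangular solve in exact arithmetic
    return g[:sizes[0]]                        # gradient w.r.t. the input block
```

Expanding the block rows of M^T g = rhs recovers exactly the usual backward recursion g_l = J_l^T g_{l+1} with g_L = grad_out, which is the point of the formulation.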
This paper has three contributions: (i) it is of\nintellectual value to replace traditional treatments of automatic\ndifferentiation with a (left acting) operator theoretic, graph-based approach;\n(ii) operators can be readily placed in matrices in software in programming\nlanguages such as Julia as an implementation option; (iii) we introduce a novel\nnotation, ``transpose dot'' operator ``$\\{\\}^{T_\\bullet}$'' that allows for the\nreversal of operators.\n We further demonstrate the elegance of the operators approach in a suitable\nprogramming language consisting of generic linear algebra operators such as\nJulia \\cite{bezanson2017julia}, and that it is possible to realize this\nabstraction in code. Our implementation shows how generic linear algebra can\nallow operators as elements of matrices. In contrast to ``operator\noverloading,'' where backslash would normally have to be rewritten to take\nadvantage of operators, with ``generic programming'' there is no such need.\n","authors":["Alan Edelman","Ekin Akyurek","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15449v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2308.16385v1","updated":"2023-08-31T01:03:27Z","published":"2023-08-31T01:03:27Z","title":"BenchTemp: A General Benchmark for Evaluating Temporal Graph Neural\n Networks","summary":" To handle graphs in which features or connectivities are evolving over time,\na series of temporal graph neural networks (TGNNs) have been proposed. Despite\nthe success of these TGNNs, the previous TGNN evaluations reveal several\nlimitations regarding four critical issues: 1) inconsistent datasets, 2)\ninconsistent evaluation pipelines, 3) lacking workload diversity, and 4)\nlacking efficient comparison. Overall, there lacks an empirical study that puts\nTGNN models onto the same ground and compares them comprehensively. To this\nend, we propose BenchTemp, a general benchmark for evaluating TGNN models on\nvarious workloads. BenchTemp provides a set of benchmark datasets so that\ndifferent TGNN models can be fairly compared. Further, BenchTemp engineers a\nstandard pipeline that unifies the TGNN evaluation. With BenchTemp, we\nextensively compare the representative TGNN models on different tasks (e.g.,\nlink prediction and node classification) and settings (transductive and\ninductive), w.r.t. both effectiveness and efficiency metrics. We have made\nBenchTemp publicly available at https://github.com/qianghuangwhu/benchtemp.\n","authors":["Qiang Huang","Jiawei Jiang","Xi Susie Rao","Ce Zhang","Zhichao Han","Zitao Zhang","Xin Wang","Yongjun He","Quanqing Xu","Yang Zhao","Chuang Hu","Shuo Shang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2308.16385v1.pdf","comment":"28 pages, 23 figures, 27 tables. Submitted to the Conference on\n Neural Information Processing Systems 2023 Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2308.16379v1","updated":"2023-08-31T00:47:58Z","published":"2023-08-31T00:47:58Z","title":"Multi-Objective Decision Transformers for Offline Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) is structured to derive policies from\nstatic trajectory data without requiring real-time environment interactions.\nRecent studies have shown the feasibility of framing offline RL as a sequence\nmodeling task, where the sole aim is to predict actions based on prior context\nusing the transformer architecture. 
However, the limitation of this single task\nlearning approach is its potential to undermine the transformer model's\nattention mechanism, which should ideally allocate varying attention weights\nacross different tokens in the input context for optimal prediction. To address\nthis, we reformulate offline RL as a multi-objective optimization problem,\nwhere the prediction is extended to states and returns. We also highlight a\npotential flaw in the trajectory representation used for sequence modeling,\nwhich could generate inaccuracies when modeling the state and return\ndistributions. This is due to the non-smoothness of the action distribution\nwithin the trajectory dictated by the behavioral policy. To mitigate this\nissue, we introduce action space regions to the trajectory representation. Our\nexperiments on D4RL benchmark locomotion tasks reveal that our propositions\nallow for more effective utilization of the attention mechanism in the\ntransformer model, resulting in performance that either matches or outperforms\ncurrent state-of-the art methods.\n","authors":["Abdelghani Ghanem","Philippe Ciblat","Mounir Ghogho"],"pdf_url":"https://arxiv.org/pdf/2308.16379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16375v1","updated":"2023-08-31T00:31:08Z","published":"2023-08-31T00:31:08Z","title":"A Survey on Privacy in Graph Neural Networks: Attacks, Preservation, and\n Applications","summary":" Graph Neural Networks (GNNs) have gained significant attention owing to their\nability to handle graph-structured data and the improvement in practical\napplications. However, many of these models prioritize high utility\nperformance, such as accuracy, with a lack of privacy consideration, which is a\nmajor concern in modern society where privacy attacks are rampant. To address\nthis issue, researchers have started to develop privacy-preserving GNNs.\nDespite this progress, there is a lack of a comprehensive overview of the\nattacks and the techniques for preserving privacy in the graph domain. In this\nsurvey, we aim to address this gap by summarizing the attacks on graph data\naccording to the targeted information, categorizing the privacy preservation\ntechniques in GNNs, and reviewing the datasets and applications that could be\nused for analyzing/solving privacy issues in GNNs. We also outline potential\ndirections for future research in order to build better privacy-preserving\nGNNs.\n","authors":["Yi Zhang","Yuying Zhao","Zhaoqing Li","Xueqi Cheng","Yu Wang","Olivera Kotevska","Philip S. Yu","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.16375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.04348v3","updated":"2023-08-31T00:23:22Z","published":"2022-04-09T01:48:41Z","title":"Neuronal diversity can improve machine learning for physics and beyond","summary":" Diversity conveys advantages in nature, yet homogeneous neurons typically\ncomprise the layers of artificial neural networks. Here we construct neural\nnetworks from neurons that learn their own activation functions, quickly\ndiversify, and subsequently outperform their homogeneous counterparts on image\nclassification and nonlinear regression tasks. Sub-networks instantiate the\nneurons, which meta-learn especially efficient sets of nonlinear responses.\nExamples include conventional neural networks classifying digits and\nforecasting a van der Pol oscillator and physics-informed Hamiltonian neural\nnetworks learning H\\'enon-Heiles stellar orbits and the swing of a video\nrecorded pendulum clock. 
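A single learned-diversity neuron can be as simple as a trainable mixture over a small basis of nonlinear responses; a sketch with an assumed three-function basis (the paper meta-learns the sub-networks that instantiate the neurons):

```python
import numpy as np

def learned_activation(z, coeffs):
    """A neuron whose nonlinearity is itself learnable: a weighted mix of
    tanh, sine, and softplus responses. Training per-neuron coefficients lets
    an initially homogeneous layer diversify. coeffs: (3,)."""
    softplus = np.log1p(np.exp(-np.abs(z))) + np.maximum(z, 0)  # stable softplus
    basis = np.stack([np.tanh(z), np.sin(z), softplus])
    return coeffs @ basis
```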
Such \\textit{learned diversity} provides examples of\ndynamical systems selecting diversity over uniformity and elucidates the role\nof diversity in natural and artificial systems.\n","authors":["Anshul Choudhary","Anil Radhakrishnan","John F. Lindner","Sudeshna Sinha","William L. Ditto"],"pdf_url":"https://arxiv.org/pdf/2204.04348v3.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2102.04307v3","updated":"2023-08-31T00:20:06Z","published":"2021-02-08T16:10:50Z","title":"Learning Optimal Strategies for Temporal Tasks in Stochastic Games","summary":" Synthesis from linear temporal logic (LTL) specifications provides assured\ncontrollers for systems operating in stochastic and potentially adversarial\nenvironments. Automatic synthesis tools, however, require a model of the\nenvironment to construct controllers. In this work, we introduce a model-free\nreinforcement learning (RL) approach to derive controllers from given LTL\nspecifications even when the environment is completely unknown. We model the\nproblem as a stochastic game (SG) between the controller and the adversarial\nenvironment; we then learn optimal control strategies that maximize the\nprobability of satisfying the LTL specifications against the worst-case\nenvironment behavior. We first construct a product game using the deterministic\nparity automaton (DPA) translated from the given LTL specification. By deriving\ndistinct rewards and discount factors from the acceptance condition of the DPA,\nwe reduce the maximization of the worst-case probability of satisfying the LTL\nspecification into the maximization of a discounted reward objective in the\nproduct game; this enables the use of model-free RL algorithms to learn an\noptimal controller strategy. To deal with the common scalability problems when\nthe number of sets defining the acceptance condition of the DPA (usually\nreferred as colors), is large, we propose a lazy color generation method where\ndistinct rewards and discount factors are utilized only when needed, and an\napproximate method where the controller eventually focuses on only one color.\nIn several case studies, we show that our approach is scalable to a wide range\nof LTL formulas, significantly outperforming existing methods for learning\ncontrollers from LTL specifications in SGs.\n","authors":["Alper Kamil Bozkurt","Yu Wang","Michael M. Zavlanos","Miroslav Pajic"],"pdf_url":"https://arxiv.org/pdf/2102.04307v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16369v1","updated":"2023-08-31T00:03:02Z","published":"2023-08-31T00:03:02Z","title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked\n Prefills","summary":" Large Language Model (LLM) inference consists of two distinct phases -\nprefill phase which processes the input prompt and decode phase which generates\noutput tokens autoregressively. While the prefill phase effectively saturates\nGPU compute at small batch sizes, the decode phase results in low compute\nutilization as it generates one token at a time per request. The varying\nprefill and decode times also lead to imbalance across micro-batches when using\npipeline parallelism, resulting in further inefficiency due to bubbles.\n We present SARATHI to address these challenges. SARATHI employs\nchunked-prefills, which splits a prefill request into equal sized chunks, and\ndecode-maximal batching, which constructs a batch using a single prefill chunk\nand populates the remaining slots with decodes. 
During inference, the prefill\nchunk saturates GPU compute, while the decode requests 'piggyback' and cost up\nto an order of magnitude less compared to a decode-only batch. Chunked-prefills\nallows constructing multiple decode-maximal batches from a single prefill\nrequest, maximizing coverage of decodes that can piggyback. Furthermore, the\nuniform compute design of these batches ameliorates the imbalance between\nmicro-batches, significantly reducing pipeline bubbles.\n Our techniques yield significant improvements in inference performance across\nmodels and hardware. For the LLaMA-13B model on A6000 GPU, SARATHI improves\ndecode throughput by up to 10x, and accelerates end-to-end throughput by up to\n1.33x. For LLaMa-33B on A100 GPU, we achieve 1.25x higher end-to-end-throughput\nand up to 4.25x higher decode throughput. When used with pipeline parallelism\non GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end\nthroughput improvement of 1.91x.\n","authors":["Amey Agrawal","Ashish Panwar","Jayashree Mohan","Nipun Kwatra","Bhargav S. Gulavani","Ramachandran Ramjee"],"pdf_url":"https://arxiv.org/pdf/2308.16369v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.09312v2","updated":"2023-08-31T15:32:01Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks, such as Reddit discussions. In contrast to traditional comment-only\nmethods, our approach to labelling a comment as hate speech involves a holistic\nanalysis of text and images grounded in the discussion context. This is done by\nleveraging graph transformers to capture the contextual relationships in the\nentire discussion surrounding a comment and grounding the interwoven fusion\nlayers that combine individual comments' text and image embeddings instead of\nprocessing modalities separately. We compare the performance of our model to\nbaselines that only process individual comments and conduct extensive ablation\nstudies. To evaluate our work, we present a new dataset, HatefulDiscussions,\ncomprising complete multi-modal discussions from multiple online communities on\nReddit. We conclude with future work for multimodal solutions to deliver social\nvalue in online contexts, arguing that capturing a holistic view of a\nconversation significantly advances the effort to detect anti-social behaviour.\n","authors":["Liam Hebert","Gaurav Sahu","Yuxuan Guo","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2308.16725v1","updated":"2023-08-31T13:41:34Z","published":"2023-08-31T13:41:34Z","title":"Terrain Diffusion Network: Climatic-Aware Terrain Generation with\n Geological Sketch Guidance","summary":" Sketch-based terrain generation seeks to create realistic landscapes for\nvirtual environments in various applications such as computer games, animation\nand virtual reality. 
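For the mDT model above, the two key moves, fusing each comment's text and image embeddings and then attending along the discussion graph, can be caricatured in a few lines (toy shapes; masked dense attention stands in for the graph transformer):

```python
import numpy as np

def fuse_comment(text_emb, img_emb, W_t, W_i):
    """Minimal stand-in for mDT-style fusion: project both modalities into a
    shared space and sum, rather than processing them separately."""
    fused = np.tanh(W_t @ text_emb)
    if img_emb is not None:                    # comments without images are common
        fused = fused + np.tanh(W_i @ img_emb)
    return fused

def discussion_attention(H, A, Wq, Wk, Wv):
    """One attention step masked to the discussion graph, so each comment's
    representation is grounded in its surrounding conversational context."""
    Q, K, V = H @ Wq, H @ Wk, H @ Wv
    scores = Q @ K.T / np.sqrt(K.shape[1])
    scores = np.where(A > 0, scores, -1e9)     # attend along discussion edges only
    w = np.exp(scores - scores.max(axis=1, keepdims=True))
    w = w / w.sum(axis=1, keepdims=True)
    return w @ V
```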
Recently, deep learning-based terrain generation has\nemerged, notably those based on generative adversarial networks (GANs).\nHowever, these methods often struggle to fulfill the requirements of flexible\nuser control and maintain generative diversity for realistic terrain.\nTherefore, we propose a novel diffusion-based method, namely terrain diffusion\nnetwork (TDN), which actively incorporates user guidance for enhanced\ncontrollability, taking into account terrain features like rivers, ridges,\nbasins, and peaks. Instead of adhering to a conventional monolithic denoising\nprocess, which often compromises the fidelity of terrain details or the\nalignment with user control, a multi-level denoising scheme is proposed to\ngenerate more realistic terrains by taking into account fine-grained details,\nparticularly those related to climatic patterns influenced by erosion and\ntectonic activities. Specifically, three terrain synthesisers are designed for\nstructural, intermediate, and fine-grained level denoising purposes, which\nallow each synthesiser to concentrate on a distinct terrain aspect. Moreover, to\nmaximise the efficiency of our TDN, we further introduce terrain and sketch\nlatent spaces for the synthesisers with pre-trained terrain autoencoders.\nComprehensive experiments on a new dataset constructed from NASA Topology\nImages clearly demonstrate the effectiveness of our proposed method, achieving\nstate-of-the-art performance. Our code and dataset will be publicly\navailable.\n","authors":["Zexin Hu","Kun Hu","Clinton Mo","Lei Pan","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16418v1","updated":"2023-08-31T03:09:25Z","published":"2023-08-31T03:09:25Z","title":"End-Edge Coordinated Joint Encoding and Neural Enhancement for Low-Light\n Video Analytics","summary":" In this paper, we investigate video analytics in low-light environments, and\npropose an end-edge coordinated system with joint video encoding and\nenhancement. It adaptively transmits low-light videos from cameras and performs\nenhancement and inference tasks at the edge. Firstly, according to our\nobservations, both encoding and enhancement for low-light videos have a\nsignificant impact on inference accuracy, which directly influences bandwidth\nand computation overhead. Secondly, due to the limitation of built-in\ncomputation resources, cameras perform encoding and transmit frames to the\nedge. The edge executes neural enhancement to process low contrast, detail\nloss, and color distortion on low-light videos before inference. Finally, an\nadaptive controller is designed at the edge to select quantization parameters\nand scales of neural enhancement networks, aiming to improve the inference\naccuracy and meet the latency requirements. 
Extensive real-world experiments\ndemonstrate that the proposed system can achieve a better trade-off between\ncommunication and computation resources and optimize the inference accuracy.\n","authors":["Yuanyi He","Peng Yang","Tian Qin","Ning Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16417v1","updated":"2023-08-31T03:03:29Z","published":"2023-08-31T03:03:29Z","title":"Edge-Assisted Lightweight Region-of-Interest Extraction and Transmission\n for Vehicle Perception","summary":" To enhance on-road environmental perception for autonomous driving, accurate\nand real-time analytics on high-resolution video frames generated from on-board\ncameras becomes crucial. In this paper, we design a lightweight object\nlocation method based on class activation mapping (CAM) to rapidly capture the\nregion of interest (RoI) boxes that contain driving safety related objects from\non-board cameras, which can not only improve the inference accuracy of vision\ntasks, but also reduce the amount of transmitted data. Considering the limited\non-board computation resources, the RoI boxes extracted from the raw image are\noffloaded to the edge for further processing. Considering both the dynamics of\nvehicle-to-edge communications and the limited edge resources, we propose an\nadaptive RoI box offloading algorithm to ensure prompt and accurate inference\nby adjusting the down-sampling rate of each box. Extensive experimental results\non four high-resolution video streams demonstrate that our approach can\neffectively improve the overall accuracy by up to 16% and reduce the\ntransmission demand by up to 49%, compared with other benchmarks.\n","authors":["Yan Cheng","Peng Yang","Ning Zhang","Jiawei Hou"],"pdf_url":"https://arxiv.org/pdf/2308.16417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16413v1","updated":"2023-08-31T02:54:30Z","published":"2023-08-31T02:54:30Z","title":"Edge-Assisted On-Device Model Update for Video Analytics in Adverse\n Environments","summary":" While large deep neural networks excel at general video analytics tasks, the\nsignificant demand on computing capacity makes them infeasible for real-time\ninference on resource-constrained end cameras. In this paper, we propose an\nedge-assisted framework that continuously updates the lightweight model\ndeployed on the end cameras to achieve accurate predictions in adverse\nenvironments. This framework consists of three modules, namely, a key frame\nextractor, a trigger controller, and a retraining manager. The low-cost key\nframe extractor obtains frames that can best represent the current environment.\nThose frames are then transmitted and buffered as the retraining data for model\nupdate at the edge server. Once the trigger controller detects a significant\naccuracy drop in the selected frames, the retraining manager outputs the\noptimal retraining configuration balancing the accuracy and time cost. We\nprototype our system on two end devices of different computing capacities with\none edge server. 
The results demonstrate that our approach significantly\nimproves accuracy across all tested adverse environment scenarios (up to 24%)\nand reduces more than 50% of the retraining time compared to existing\nbenchmarks.\n","authors":["Yuxin Kong","Peng Yang","Yan Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.16413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16383v1","updated":"2023-08-31T01:00:59Z","published":"2023-08-31T01:00:59Z","title":"Separate and Locate: Rethink the Text in Text-based Visual Question\n Answering","summary":" Text-based Visual Question Answering (TextVQA) aims at answering questions\nabout the text in images. Most works in this field focus on designing network\nstructures or pre-training tasks. All these methods list the OCR texts in\nreading order (from left to right and top to bottom) to form a sequence, which\nis treated as a natural language ``sentence''. However, they ignore the fact\nthat most OCR words in the TextVQA task do not have a semantical contextual\nrelationship. In addition, these approaches use 1-D position embedding to\nconstruct the spatial relation between OCR tokens sequentially, which is not\nreasonable. The 1-D position embedding can only represent the left-right\nsequence relationship between words in a sentence, but not the complex spatial\nposition relationship. To tackle these problems, we propose a novel method\nnamed Separate and Locate (SaL) that explores text contextual cues and designs\nspatial position embedding to construct spatial relations between OCR texts.\nSpecifically, we propose a Text Semantic Separate (TSS) module that helps the\nmodel recognize whether words have semantic contextual relations. Then, we\nintroduce a Spatial Circle Position (SCP) module that helps the model better\nconstruct and reason the spatial position relationships between OCR texts. Our\nSaL model outperforms the baseline model by 4.44% and 3.96% accuracy on TextVQA\nand ST-VQA datasets. Compared with the pre-training state-of-the-art method\npre-trained on 64 million pre-training samples, our method, without any\npre-training tasks, still achieves 2.68% and 2.52% accuracy improvement on\nTextVQA and ST-VQA. 
Our code and models will be released at\nhttps://github.com/fangbufang/SaL.\n","authors":["Chengyang Fang","Jiangnan Li","Liang Li","Can Ma","Dayong Hu"],"pdf_url":"https://arxiv.org/pdf/2308.16383v1.pdf","comment":"Accepted by ACM MM 2023"}]}}
\ No newline at end of file
diff --git a/favicon.ico b/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5
GIT binary patch
literal 15086
[15086 bytes of binary icon data omitted]
+ MyArxiv
+ + Computation and Language 41 + +
+
+
+ + ☆ PointLLM: Empowering Large Language Models to Understand Point Clouds + + +
+ The unprecedented advancements in Large Language Models (LLMs) have created a +profound impact on natural language processing but are yet to fully embrace the +realm of 3D understanding. This paper introduces PointLLM, a preliminary effort +to fill this gap, thereby enabling LLMs to understand point clouds and offering +a new avenue beyond 2D visual data. PointLLM processes colored object point +clouds with human instructions and generates contextually appropriate +responses, illustrating its grasp of point clouds and common sense. +Specifically, it leverages a point cloud encoder with a powerful LLM to +effectively fuse geometric, appearance, and linguistic information. We collect +a novel dataset comprising 660K simple and 70K complex point-text instruction +pairs to enable a two-stage training strategy: initially aligning latent spaces +and subsequently instruction-tuning the unified model. To rigorously evaluate +our model's perceptual abilities and its generalization capabilities, we +establish two benchmarks: Generative 3D Object Classification and 3D Object +Captioning, assessed through three different methods, including human +evaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment +results show that PointLLM demonstrates superior performance over existing 2D +baselines. Remarkably, in human-evaluated object captioning tasks, PointLLM +outperforms human annotators in over 50% of the samples. Codes, datasets, and +benchmarks are available at https://github.com/OpenRobotLab/PointLLM . + +
+
+ comment: 19 pages. Empowering large language models with 3D point cloud + understanding, accompanied by a novel dataset and carefully designed + benchmarks. Project page: https://runsenxu.com/projects/PointLLM +
+
+
+
+
+ + ☆ Transformers as Support Vector Machines + + +
+ Since its inception in "Attention Is All You Need", transformer architecture +has led to revolutionary advancements in NLP. The attention layer within the +transformer admits a sequence of input tokens $X$ and makes them interact +through pairwise similarities computed as softmax$(XQK^\top X^\top)$, where +$(K,Q)$ are the trainable key-query parameters. In this work, we establish a +formal equivalence between the optimization geometry of self-attention and a +hard-margin SVM problem that separates optimal input tokens from non-optimal +tokens using linear constraints on the outer-products of token pairs. This +formalism allows us to characterize the implicit bias of 1-layer transformers +optimized with gradient descent: (1) Optimizing the attention layer with +vanishing regularization, parameterized by $(K,Q)$, converges in direction to +an SVM solution minimizing the nuclear norm of the combined parameter +$W=KQ^\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm +objective. We characterize this convergence, highlighting that it can occur +toward locally-optimal directions rather than global ones. (2) Complementing +this, we prove the local/global directional convergence of gradient descent +under suitable geometric conditions. Importantly, we show that +over-parameterization catalyzes global convergence by ensuring the feasibility +of the SVM problem and by guaranteeing a benign optimization landscape devoid +of stationary points. (3) While our theory applies primarily to linear +prediction heads, we propose a more general SVM equivalence that predicts the +implicit bias with nonlinear heads. Our findings are applicable to arbitrary +datasets and their validity is verified via experiments. We also introduce +several open problems and research directions. We believe these findings +inspire the interpretation of transformers as a hierarchy of SVMs that +separates and selects optimal tokens. + +
+
+
+
+
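The formal statement above is compact; as a reading aid, here is a minimal numpy sketch (not the authors' code) of the one-layer attention map the paper analyzes, softmax$(XQK^\top X^\top)$, together with a check that it can be re-expressed through the combined parameter $W = KQ^\top$ whose nuclear-norm bias the result concerns. All shapes and values are illustrative only.

```python
# Minimal sketch of the attention map studied in the paper.
import numpy as np

rng = np.random.default_rng(0)
T, d = 5, 8                      # sequence length, embedding dimension
X = rng.standard_normal((T, d))  # input tokens
K = rng.standard_normal((d, d))  # trainable key parameters
Q = rng.standard_normal((d, d))  # trainable query parameters

W = K @ Q.T                      # combined parameter W = K Q^T

def softmax(a, axis=-1):
    a = a - a.max(axis=axis, keepdims=True)
    e = np.exp(a)
    return e / e.sum(axis=axis, keepdims=True)

# Pairwise similarities softmax(X Q K^T X^T); each row is one token's
# attention distribution over the sequence.
A = softmax(X @ Q @ K.T @ X.T)

# The same map expressed through W: X Q K^T X^T == X W^T X^T.
assert np.allclose(X @ Q @ K.T @ X.T, X @ W.T @ X.T)
```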
+ + ☆ TouchStone: Evaluating Vision-Language Models by Language Models + + +
+ Large vision-language models (LVLMs) have recently witnessed rapid +advancements, exhibiting a remarkable capacity for perceiving, understanding, +and processing visual information by connecting visual receptor with large +language models (LLMs). However, current assessments mainly focus on +recognizing and reasoning abilities, lacking direct evaluation of +conversational skills and neglecting visual storytelling abilities. In this +paper, we propose an evaluation method that uses strong LLMs as judges to +comprehensively evaluate the various abilities of LVLMs. Firstly, we construct +a comprehensive visual dialogue dataset TouchStone, consisting of open-world +images and questions, covering five major categories of abilities and 27 +subtasks. This dataset not only covers fundamental recognition and +comprehension but also extends to literary creation. Secondly, by integrating +detailed image annotations we effectively transform the multimodal input +content into a form understandable by LLMs. This enables us to employ advanced +LLMs for directly evaluating the quality of the multimodal dialogue without +requiring human intervention. Through validation, we demonstrate that powerful +LVLMs, such as GPT-4, can effectively score dialogue quality by leveraging +their textual capabilities alone, aligning with human preferences. We hope our +work can serve as a touchstone for LVLMs' evaluation and pave the way for +building stronger LVLMs. The evaluation code is available at +https://github.com/OFA-Sys/TouchStone. + +
+
+ comment: https://github.com/OFA-Sys/TouchStone +
+
+
+
+
+ + ☆ The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 + Language Variants + + +
+ We present Belebele, a multiple-choice machine reading comprehension (MRC) +dataset spanning 122 language variants. Significantly expanding the language +coverage of natural language understanding (NLU) benchmarks, this dataset +enables the evaluation of text models in high-, medium-, and low-resource +languages. Each question is based on a short passage from the Flores-200 +dataset and has four multiple-choice answers. The questions were carefully +curated to discriminate between models with different levels of general +language comprehension. The English dataset on its own proves difficult enough +to challenge state-of-the-art language models. Being fully parallel, this +dataset enables direct comparison of model performance across all languages. We +use this dataset to evaluate the capabilities of multilingual masked language +models (MLMs) and large language models (LLMs). We present extensive results +and find that despite significant cross-lingual transfer in English-centric +LLMs, much smaller MLMs pretrained on balanced multilingual data still +understand far more languages. We also observe that larger vocabulary size and +conscious vocabulary construction correlate with better performance on +low-resource languages. Overall, Belebele opens up new avenues for evaluating +and analyzing the multilingual capabilities of NLP systems. + +
+
+ comment: 27 pages, 13 figures +
+
+
+
+
+ + ☆ The Gender-GAP Pipeline: A Gender-Aware Polyglot Pipeline for Gender + Characterisation in 55 Languages + + +
+ Gender biases in language generation systems are challenging to mitigate. One +possible source for these biases is gender representation disparities in the +training and evaluation data. Despite recent progress in documenting this +problem and many attempts at mitigating it, we still lack shared methodology +and tooling to report gender representation in large datasets. Such +quantitative reporting will enable further mitigation, e.g., via data +augmentation. This paper describes the Gender-GAP Pipeline (for Gender-Aware +Polyglot Pipeline), an automatic pipeline to characterize gender representation +in large-scale datasets for 55 languages. The pipeline uses a multilingual +lexicon of gendered person-nouns to quantify the gender representation in text. +We showcase it to report gender representation in WMT training data and +development data for the News task, confirming that current data is skewed +towards masculine representation. Having unbalanced datasets may indirectly +optimize our systems towards outperforming one gender over the others. We +suggest introducing our gender quantification pipeline in current datasets and, +ideally, modifying them toward a balanced representation. + +
+
+ comment: 15 pages +
+
+
+
+
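Since the pipeline's core operation is lexicon-based counting, a toy sketch may help fix ideas. The three-class lexicon below is invented for illustration; the actual pipeline relies on a curated multilingual lexicon of gendered person-nouns covering 55 languages.

```python
# Toy gender-representation counter in the spirit of the Gender-GAP Pipeline.
from collections import Counter
import re

LEXICON = {  # invented, illustrative word lists only
    "masculine": {"he", "him", "his", "man", "men", "father", "son"},
    "feminine": {"she", "her", "hers", "woman", "women", "mother", "daughter"},
    "unspecified": {"they", "them", "person", "people", "parent", "child"},
}

def gender_counts(text: str) -> Counter:
    """Count lexicon matches per gender class in a lowercased tokenization."""
    tokens = re.findall(r"[a-z']+", text.lower())
    counts = Counter()
    for klass, words in LEXICON.items():
        counts[klass] += sum(tok in words for tok in tokens)
    return counts

print(gender_counts("The father and his son met a woman; they thanked her."))
# Counter({'masculine': 3, 'feminine': 2, 'unspecified': 1})
```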
+ + ☆ Can Programming Languages Boost Each Other via Instruction Tuning? + + +
+ When human programmers have mastered a programming language, it is
+easier for them to learn a new one. In this report, we focus on
+exploring whether programming languages can boost each other during the
+instruction fine-tuning phase of code large language models. We conduct
+extensive experiments with 8 popular programming languages (Python, JavaScript,
+TypeScript, C, C++, Java, Go, HTML) on StarCoder. Results demonstrate that
+programming languages can significantly improve each other. For example,
+CodeM-Python 15B trained on Python is able to improve Java performance by an
+absolute 17.95% pass@1 on HumanEval-X. More surprisingly, we found that
+CodeM-HTML 7B trained on the HTML corpus can improve Java by an absolute 15.24%
+pass@1. Our training data is released at https://github.com/NL2Code/CodeM. +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Simple LLM Prompting is State-of-the-Art for Robust and Multilingual + Dialogue Evaluation + + +
+ Despite significant research effort in the development of automatic dialogue +evaluation metrics, little thought is given to evaluating dialogues other than +in English. At the same time, ensuring metrics are invariant to semantically +similar responses is also an overlooked topic. In order to achieve the desired +properties of robustness and multilinguality for dialogue evaluation metrics, +we propose a novel framework that takes advantage of the strengths of current +evaluation models with the newly-established paradigm of prompting Large +Language Models (LLMs). Empirical results show our framework achieves state of +the art results in terms of mean Spearman correlation scores across several +benchmarks and ranks first place on both the Robust and Multilingual tasks of +the DSTC11 Track 4 "Automatic Evaluation Metrics for Open-Domain Dialogue +Systems", proving the evaluation capabilities of prompted LLMs. + +
+
+ comment: DSTC11 best paper for Track 4 +
+
+
+
+
+ + ☆ Towards Multilingual Automatic Dialogue Evaluation SIGDIAL23 + + +
+ The main limiting factor in the development of robust multilingual dialogue +evaluation metrics is the lack of multilingual data and the limited +availability of open sourced multilingual dialogue systems. In this work, we +propose a workaround for this lack of data by leveraging a strong multilingual +pretrained LLM and augmenting existing English dialogue data using Machine +Translation. We empirically show that the naive approach of finetuning a +pretrained multilingual encoder model with translated data is insufficient to +outperform the strong baseline of finetuning a multilingual model with only +source data. Instead, the best approach consists in the careful curation of +translated data using MT Quality Estimation metrics, excluding low quality +translations that hinder its performance. + +
+
+ comment: SIGDIAL23 +
+
+
+
+
+ + ☆ Enhancing PLM Performance on Labour Market Tasks via Instruction-based + Finetuning and Prompt-tuning with Rules RecSys + + +
+ The increased digitization of the labour market has given researchers, +educators, and companies the means to analyze and better understand the labour +market. However, labour market resources, although available in high volumes, +tend to be unstructured, and as such, research towards methodologies for the +identification, linking, and extraction of entities becomes more and more +important. Against the backdrop of this quest for better labour market +representations, resource constraints and the unavailability of large-scale +annotated data cause a reliance on human domain experts. We demonstrate the +effectiveness of prompt-based tuning of pre-trained language models (PLM) in +labour market specific applications. Our results indicate that cost-efficient +methods such as PTR and instruction tuning without exemplars can significantly +increase the performance of PLMs on downstream labour market applications +without introducing additional model layers, manual annotations, and data +augmentation. + +
+
+ comment: accepted for publication at RecSys in HR 2023 +
+
+
+
+
+ + ☆ Ladder-of-Thought: Using Knowledge as Steps to Elevate Stance Detection + + +
+ Chain-of-Thought Prompting (CoT) reinforces the reasoning capabilities of
+Large Language Models (LLMs) through the generation of intermediate rationales.
+However, these enhancements predominantly benefit large-scale models, leaving
+small LMs without significant performance improvements when directly applying
+CoT. Despite the advanced reasoning capabilities of LLMs, CoT relies primarily
+on their pre-trained internal knowledge. External knowledge that is
+previously unknown to the model remains unexploited. This omission becomes
+pronounced in tasks such as stance detection, where the external background
+knowledge plays a pivotal role. Additionally, the large-scale architecture of
+LLMs inevitably presents efficiency challenges during deployment. To address
+these challenges, we introduce the Ladder-of-Thought (LoT) for stance
+detection. Grounded in a dual-phase Cascaded Optimization framework, LoT
+directs the model to incorporate high-quality external knowledge, enhancing the
+intermediate rationales it generates. These bolstered rationales subsequently
+serve as the foundation for more precise predictions - akin to how a ladder
+facilitates reaching elevated goals. LoT achieves a balance between efficiency
+and accuracy, making it an adaptable and efficient framework for stance
+detection. Our empirical evaluations underscore LoT's effectiveness, marking a
+16% improvement over ChatGPT and a 10% enhancement compared to ChatGPT with
+CoT. +
+
+ comment: 5 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ CReHate: Cross-cultural Re-annotation of English Hate Speech Dataset + + +
+ English datasets predominantly reflect the perspectives of certain +nationalities, which can lead to cultural biases in models and datasets. This +is particularly problematic in tasks heavily influenced by subjectivity, such +as hate speech detection. To delve into how individuals from different +countries perceive hate speech, we introduce CReHate, a cross-cultural +re-annotation of the sampled SBIC dataset. This dataset includes annotations +from five distinct countries: Australia, Singapore, South Africa, the United +Kingdom, and the United States. Our thorough statistical analysis highlights +significant differences based on nationality, with only 59.4% of the samples +achieving consensus among all countries. We also introduce a culturally +sensitive hate speech classifier via transfer learning, adept at capturing +perspectives of different nationalities. These findings underscore the need to +re-evaluate certain aspects of NLP research, especially with regard to the +nuanced nature of hate speech in the English language. + +
+
+
+
+
+ + ☆ SpeechTokenizer: Unified Speech Tokenizer for Speech Large Language + Models + + +
+ Current speech large language models build upon discrete speech
+representations, which can be categorized into semantic tokens and acoustic
+tokens. However, existing speech tokens are not specifically designed for
+speech language modeling. To assess the suitability of speech tokens for
+building speech language models, we established the first benchmark,
+SLMTokBench. Our results indicate that neither semantic nor acoustic tokens are
+ideal for this purpose. Therefore, we propose SpeechTokenizer, a unified speech
+tokenizer for speech large language models. SpeechTokenizer adopts the
+Encoder-Decoder architecture with residual vector quantization (RVQ). Unifying
+semantic and acoustic tokens, SpeechTokenizer disentangles different aspects of
+speech information hierarchically across different RVQ layers. Furthermore, we
+construct a Unified Speech Language Model (USLM) leveraging SpeechTokenizer.
+Experiments show that SpeechTokenizer performs comparably to EnCodec in speech
+reconstruction and demonstrates strong performance on the SLMTokBench
+benchmark. Also, USLM outperforms VALL-E in zero-shot Text-to-Speech tasks.
+Code and models are available at
+https://github.com/ZhangXInFD/SpeechTokenizer/. +
+
+ comment: SpeechTokenizer project page is + https://0nutation.github.io/SpeechTokenizer.github.io/ +
+
+
+
+
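For readers unfamiliar with RVQ, the mechanism SpeechTokenizer builds on, here is a generic numpy sketch: each quantizer layer encodes the residual left by the previous one, so early layers can carry coarse content and later layers finer detail. The codebooks below are random stand-ins; the real model learns them, and the semantic/acoustic disentanglement itself is not shown.

```python
# Generic residual vector quantization (RVQ) sketch.
import numpy as np

rng = np.random.default_rng(0)
dim, codebook_size, n_layers = 16, 64, 4
codebooks = rng.standard_normal((n_layers, codebook_size, dim))

def rvq_encode(x, codebooks):
    """Return one code index per layer; the residual shrinks layer by layer."""
    residual, codes = x, []
    for cb in codebooks:
        idx = int(np.argmin(((cb - residual) ** 2).sum(axis=1)))
        codes.append(idx)
        residual = residual - cb[idx]
    return codes, residual

def rvq_decode(codes, codebooks):
    """Reconstruction is the sum of the selected codewords."""
    return sum(cb[i] for cb, i in zip(codebooks, codes))

x = rng.standard_normal(dim)
codes, final_residual = rvq_encode(x, codebooks)
x_hat = rvq_decode(codes, codebooks)
print(codes, np.allclose(x - x_hat, final_residual))  # True
```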
+ + ☆ Using Large Language Models to Automate Category and Trend Analysis of + Scientific Articles: An Application in Ophthalmology + + +
+ Purpose: In this paper, we present an automated method for article
+classification, leveraging the power of Large Language Models (LLMs). The
+primary focus is on the field of ophthalmology, but the model is extendable to
+other fields. Methods: We have developed a model based on Natural Language
+Processing (NLP) techniques, including advanced LLMs, to process and analyze
+the textual content of scientific papers. Specifically, we have employed
+zero-shot learning (ZSL) LLM models and compared them against Bidirectional and
+Auto-Regressive Transformers (BART) and its variants, and Bidirectional Encoder
+Representations from Transformers (BERT) and its variants such as distilBERT,
+SciBERT, PubmedBERT, and BioBERT. To evaluate the LLMs, we compiled a dataset
+(RenD) of 1000 ocular disease-related articles, which were expertly annotated
+by a panel of six specialists into 15 distinct categories. Results: The model
+achieved a mean accuracy of 0.86 and a mean F1 of 0.85 on the RenD dataset,
+demonstrating the effectiveness of LLMs in categorizing a large number of
+ophthalmology papers without human intervention. Conclusion: The proposed
+framework achieves notable improvements in both accuracy and efficiency. Its
+application in the domain of ophthalmology showcases its potential for
+knowledge organization and retrieval in other domains too. We performed a trend
+analysis that enables researchers and clinicians to easily categorize and
+retrieve relevant papers, saving time and effort in literature review and
+information gathering as well as in the identification of emerging scientific
+trends within different disciplines. Moreover, the extensibility of the model
+to other scientific fields broadens its impact in facilitating research and
+trend analysis across diverse disciplines. +
+
+
+
+
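The zero-shot route described above can be approximated with off-the-shelf tooling. A hedged sketch follows: facebook/bart-large-mnli is a standard public NLI checkpoint, not necessarily the one used in the paper, and the candidate labels are invented examples rather than the 15 RenD categories.

```python
# Zero-shot article classification sketch with a public NLI model.
from transformers import pipeline

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

abstract = ("We report outcomes of anti-VEGF therapy in patients with "
            "neovascular age-related macular degeneration.")
candidate_labels = ["retina", "glaucoma", "cornea", "pediatric ophthalmology"]

result = classifier(abstract, candidate_labels)
print(result["labels"][0], round(result["scores"][0], 3))  # top category
```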
+ + ☆ DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew + + +
+ We present DictaBERT, a new state-of-the-art pre-trained BERT model for
+modern Hebrew, outperforming existing models on most benchmarks. Additionally,
+we release two fine-tuned versions of the model, designed to perform two
+specific foundational tasks in the analysis of Hebrew texts: prefix
+segmentation and morphological tagging. These fine-tuned models allow any
+developer to perform prefix segmentation and morphological tagging of a Hebrew
+sentence with a single call to a HuggingFace model, without the need to
+integrate any additional libraries or code. In this paper we describe the
+details of the training as well as the results on the different benchmarks. We
+release the models to the community, along with sample code demonstrating their
+use, as part of our goal to help further research and development in Hebrew
+NLP. +
+
+
+
+
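As a rough illustration of the promised "single call to a HuggingFace model" usage, consider the sketch below. The checkpoint identifier is a guess for illustration only; consult the released models for the actual identifiers, and note that the fine-tuned segmentation and tagging variants expose task-specific outputs rather than raw embeddings.

```python
# Hedged usage sketch; the hub id is a hypothetical placeholder.
from transformers import AutoModel, AutoTokenizer

model_name = "dicta-il/dictabert"   # hypothetical identifier
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

sentence = "שלום עולם"              # "Hello, world" in Hebrew
inputs = tokenizer(sentence, return_tensors="pt")
outputs = model(**inputs)           # contextual embeddings from the base model
print(outputs.last_hidden_state.shape)
```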
+ + ☆ Developing a Scalable Benchmark for Assessing Large Language Models in + Knowledge Graph Engineering + + +
+ As the field of Large Language Models (LLMs) evolves at an accelerated pace, +the critical need to assess and monitor their performance emerges. We introduce +a benchmarking framework focused on knowledge graph engineering (KGE) +accompanied by three challenges addressing syntax and error correction, facts +extraction and dataset generation. We show that while being a useful tool, LLMs +are yet unfit to assist in knowledge graph generation with zero-shot prompting. +Consequently, our LLM-KG-Bench framework provides automatic evaluation and +storage of LLM responses as well as statistical data and visualization tools to +support tracking of prompt engineering and model performance. + +
+
+ comment: To be published in SEMANTICS 2023 poster track proceedings. SEMANTICS + 2023 EU: 19th International Conference on Semantic Systems, September 20-22, + 2023, Leipzig, Germany +
+
+
+
+
+ + ☆ Towards Spontaneous Style Modeling with Semi-supervised Pre-training for + Conversational Text-to-Speech Synthesis INTERSPEECH 2023 + + +
+ The spontaneous behavior that often occurs in conversations makes speech more
+human-like compared to reading-style speech. However, synthesizing
+spontaneous-style speech is challenging due to the lack of high-quality
+spontaneous datasets and the high cost of labeling spontaneous behavior. In
+this paper, we propose a semi-supervised pre-training method to increase the
+amount of spontaneous-style speech and spontaneous behavioral labels. In the
+process of semi-supervised learning, both text and speech information are
+considered for detecting spontaneous behavior labels in speech. Moreover, a
+linguistic-aware encoder is used to model the relationships between sentences
+in the conversation. Experimental results indicate that our proposed method
+achieves superior expressive speech synthesis performance with the ability to
+model spontaneous behavior in spontaneous-style speech and predict reasonable
+spontaneous behavior from text. +
+
+ comment: Accepted by INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Interpreting Sentiment Composition with Latent Semantic Tree ACL2023 + + +
+ As the key to sentiment analysis, sentiment composition considers the
+classification of a constituent via classifications of its contained
+sub-constituents and rules operated on them. Such compositionality has been
+widely studied previously in the form of hierarchical trees including untagged
+and sentiment ones, which are intrinsically suboptimal in our view. To address
+this, we propose the semantic tree, a new tree form capable of interpreting
+sentiment composition in a principled way. A semantic tree is a derivation of a
+context-free grammar (CFG) describing the specific composition rules on
+different semantic roles, which is designed carefully following previous
+linguistic conclusions. However, the semantic tree is a latent variable since
+it has no annotations in regular datasets. Thus, in our method, it is
+marginalized out via the inside algorithm and learned to optimize the
+classification performance. Quantitative and qualitative results demonstrate
+that our method not only achieves better or competitive results compared to
+baselines in the setting of regular and domain adaptation classification, but
+also generates plausible tree explanations. +
+
+ comment: Findings of ACL2023 +
+
+
+
+
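The marginalization step mentioned above is a standard inside-algorithm computation. Here is a generic sketch for summing over all latent binary trees of a sentence; the random span scores stand in for the model's learned composition scores over semantic roles, which are not reproduced here.

```python
# Generic inside algorithm over latent binary trees (log-space).
import numpy as np
from scipy.special import logsumexp

rng = np.random.default_rng(0)
n = 5                                     # sentence length
score = rng.standard_normal((n, n + 1))   # log-score of span [i, j)

inside = np.full((n, n + 1), -np.inf)
for i in range(n):                        # width-1 spans
    inside[i][i + 1] = score[i][i + 1]
for width in range(2, n + 1):             # longer spans, bottom-up
    for i in range(n - width + 1):
        j = i + width
        # Sum over all split points k: left subtree [i,k), right [k,j).
        splits = [inside[i][k] + inside[k][j] for k in range(i + 1, j)]
        inside[i][j] = score[i][j] + logsumexp(splits)

log_partition = inside[0][n]              # log-sum over all binary trees
print(log_partition)
```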
+ + ☆ Unsupervised Text Style Transfer with Deep Generative Models + + +
+ We present a general framework for unsupervised text style transfer with deep +generative models. The framework models each sentence-label pair in the +non-parallel corpus as partially observed from a complete quadruplet which +additionally contains two latent codes representing the content and style, +respectively. These codes are learned by exploiting dependencies inside the +observed data. Then a sentence is transferred by manipulating them. Our +framework is able to unify previous embedding and prototype methods as two +special forms. It also provides a principled perspective to explain previously +proposed techniques in the field such as aligned encoder and adversarial +training. We further conduct experiments on three benchmarks. Both automatic +and human evaluation results show that our methods achieve better or +competitive results compared to several strong baselines. + +
+
+
+
+
+ + ☆ Improving Mandarin Prosodic Structure Prediction with Multi-level + Contextual Information + + +
+ For text-to-speech (TTS) synthesis, prosodic structure prediction (PSP) plays
+an important role in producing natural and intelligible speech. Although
+inter-utterance linguistic information can influence the speech interpretation
+of the target utterance, previous works on PSP mainly focus on utilizing
+intra-utterance linguistic information of the current utterance only. This work
+proposes to use inter-utterance linguistic information to improve the
+performance of PSP. Multi-level contextual information, which includes both
+inter-utterance and intra-utterance linguistic information, is extracted by a
+hierarchical encoder from the character level, utterance level and discourse
+level of the input text. Then a multi-task learning (MTL) decoder predicts
+prosodic boundaries from the multi-level contextual information. Objective
+evaluation results on two datasets show that our method achieves better F1
+scores in predicting prosodic word (PW), prosodic phrase (PPH) and intonational
+phrase (IPH). It demonstrates the effectiveness of using multi-level contextual
+information for PSP. Subjective preference tests also indicate that the
+naturalness of the synthesized speech is improved. +
+
+ comment: Accepted by Interspeech2022 +
+
+
+
+
+ + ☆ Thesis Distillation: Investigating The Impact of Bias in NLP Models on + Hate Speech Detection + + +
+ This paper is a summary of the work in my PhD thesis, in which I investigate
+the impact of bias in NLP models on the task of hate speech detection from
+three perspectives: explainability, offensive stereotyping bias, and fairness.
+I discuss the main takeaways from my thesis and how they can benefit the
+broader NLP community. Finally, I discuss important future research directions.
+The findings of my thesis suggest that bias in NLP models impacts the task of
+hate speech detection from all three perspectives, and that unless we start
+incorporating the social sciences in studying bias in NLP models, we will not
+effectively overcome the current limitations of measuring and mitigating bias
+in NLP models. +
+
+
+
+
+ + ☆ Time-Varying Quasi-Closed-Phase Analysis for Accurate Formant Tracking + in Speech Signals + + +
+ In this paper, we propose a new method for the accurate estimation and +tracking of formants in speech signals using time-varying quasi-closed-phase +(TVQCP) analysis. Conventional formant tracking methods typically adopt a +two-stage estimate-and-track strategy wherein an initial set of formant +candidates are estimated using short-time analysis (e.g., 10--50 ms), followed +by a tracking stage based on dynamic programming or a linear state-space model. +One of the main disadvantages of these approaches is that the tracking stage, +however good it may be, cannot improve upon the formant estimation accuracy of +the first stage. The proposed TVQCP method provides a single-stage formant +tracking that combines the estimation and tracking stages into one. TVQCP +analysis combines three approaches to improve formant estimation and tracking: +(1) it uses temporally weighted quasi-closed-phase analysis to derive +closed-phase estimates of the vocal tract with reduced interference from the +excitation source, (2) it increases the residual sparsity by using the $L_1$ +optimization and (3) it uses time-varying linear prediction analysis over long +time windows (e.g., 100--200 ms) to impose a continuity constraint on the vocal +tract model and hence on the formant trajectories. Formant tracking experiments +with a wide variety of synthetic and natural speech signals show that the +proposed TVQCP method performs better than conventional and popular formant +tracking tools, such as Wavesurfer and Praat (based on dynamic programming), +the KARMA algorithm (based on Kalman filtering), and DeepFormants (based on +deep neural networks trained in a supervised manner). Matlab scripts for the +proposed method can be found at: https://github.com/njaygowda/ftrack + +
+
+
+
+
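The three ingredients (1)-(3) above combine naturally into a single weighted, sparse, time-varying linear prediction problem. One plausible way to write that objective is shown below; the notation (signal $s$, temporal weight $W$ emphasizing quasi-closed-phase samples, basis functions $f_m$ for the time-varying coefficients) is assumed for illustration and is not quoted from the paper.

```latex
\min_{\{a_{k,m}\}} \; \sum_{n} W(n)\,
  \Bigl| \, s(n) - \sum_{k=1}^{p} a_k(n)\, s(n-k) \, \Bigr|
\qquad \text{with} \qquad
a_k(n) = \sum_{m=0}^{M} a_{k,m}\, f_m(n)
```

The $L_1$ norm on the residual encourages sparsity, and expanding each coefficient $a_k(n)$ on a small basis over a long window imposes the continuity constraint on the vocal tract model.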
+ + ☆ The Smart Data Extractor, a Clinician Friendly Solution to Accelerate + and Improve the Data Collection During Clinical Trials + + +
+ In medical research, the traditional way to collect data, i.e. browsing
+patient files, has been proven to induce bias, errors, human labor and costs.
+We propose a semi-automated system able to extract every type of data,
+including notes. The Smart Data Extractor pre-populates clinical research forms
+by following rules. We performed a cross-testing experiment to compare
+semi-automated to manual data collection. Twenty target items had to be
+collected for 79 patients. The average time to complete one form was 6'81'' for
+manual data collection and 3'22'' with the Smart Data Extractor. There were
+also more mistakes during manual data collection (163 for the whole cohort)
+than with the Smart Data Extractor (46 for the whole cohort). We present an
+easy-to-use, understandable and agile solution to fill out clinical research
+forms. It reduces human effort and provides higher quality data, avoiding data
+re-entry and fatigue-induced errors. +
+
+ comment: IOS Press, 2023, Studies in Health Technology and Informatics +
+
+
+
+
+ + ☆ Generalised Winograd Schema and its Contextuality + + +
+ Ambiguities in natural language give rise to probability distributions over +interpretations. The distributions are often over multiple ambiguous words at a +time; a multiplicity which makes them a suitable topic for sheaf-theoretic +models of quantum contextuality. Previous research showed that different +quantitative measures of contextuality correlate well with Psycholinguistic +research on lexical ambiguities. In this work, we focus on coreference +ambiguities and investigate the Winograd Schema Challenge (WSC), a test +proposed by Levesque in 2011 to evaluate the intelligence of machines. The WSC +consists of a collection of multiple-choice questions that require +disambiguating pronouns in sentences structured according to the Winograd +schema, in a way that makes it difficult for machines to determine the correct +referents but remains intuitive for human comprehension. In this study, we +propose an approach that analogously models the Winograd schema as an +experiment in quantum physics. However, we argue that the original Winograd +Schema is inherently too simplistic to facilitate contextuality. We introduce a +novel mechanism for generalising the schema, rendering it analogous to a +Bell-CHSH measurement scenario. We report an instance of this generalised +schema, complemented by the human judgements we gathered via a crowdsourcing +platform. The resulting model violates the Bell-CHSH inequality by 0.192, thus +exhibiting contextuality in a coreference resolution setting. + +
+
+ comment: In Proceedings QPL 2023, arXiv:2308.15489 +
+
+
+
+
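For orientation, the Bell-CHSH inequality referenced above bounds, for any local (non-contextual) model, a combination of correlations $E$ between two binary measurement choices per side. Reading the abstract's "violates ... by 0.192" as the margin over the classical bound suggests $S \approx 2.192$ for the crowdsourced judgements (our reading, not a figure quoted from the paper):

```latex
S \;=\; \bigl|\, E(a,b) + E(a,b') + E(a',b) - E(a',b') \,\bigr| \;\le\; 2
```

In quantum mechanics the analogous quantity can reach $2\sqrt{2} \approx 2.83$ (the Tsirelson bound), so exceeding 2 is the standard signature of contextuality.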
+ + ☆ Transformer Compression via Subspace Projection + + +
+ We propose TCSP, a novel method for compressing a transformer model by
+focusing on reducing the hidden size of the model. By projecting the whole
+transformer model into a subspace, we enable matrix operations between the
+weight matrices in the model and features in a reduced-dimensional space,
+leading to significant reductions in model parameters and computing resources.
+To establish this subspace, we decompose the feature matrix, derived from
+different layers of sampled data instances, into a projection matrix. For
+evaluation, TCSP is applied to compress T5 and BERT models on the GLUE and
+SQuAD benchmarks. Experimental results demonstrate that TCSP achieves a
+compression ratio of 44% with at most 1.6% degradation in accuracy,
+surpassing or matching prior compression methods. Furthermore, TCSP exhibits
+compatibility with other methods targeting filter and attention head size
+compression. +
+
+ comment: 21 pages, 1 figures +
+
+
+
+
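A generic subspace-projection sketch in the spirit of the description above (not the authors' implementation): derive a projection from hidden features of sampled data via an SVD, then map weight matrices into the reduced space.

```python
# Subspace projection sketch: project features and weights to r dimensions.
import numpy as np

rng = np.random.default_rng(0)
d, r, n_samples = 768, 256, 4096          # hidden size, reduced size, samples

F = rng.standard_normal((n_samples, d))   # stand-in for collected features;
_, _, Vt = np.linalg.svd(F, full_matrices=False)
P = Vt[:r].T                              # d x r projection onto top-r subspace

W = rng.standard_normal((d, d))           # an original weight matrix
W_small = P.T @ W @ P                     # r x r weight acting in the subspace

x = rng.standard_normal(d)                # inference: project, apply, lift
y_exact = W @ x
y_approx = P @ (W_small @ (P.T @ x))
# With real, correlated features the top-r subspace captures most of the
# energy, so the approximation error is far smaller than for this random data.
rel_err = np.linalg.norm(y_exact - y_approx) / np.linalg.norm(y_exact)
print(W.size, "->", W_small.size, "params; rel. error", rel_err)
```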
+ + ☆ Enhancing Subtask Performance of Multi-modal Large Language Model + + +
+ Multi-modal Large Language Model (MLLM) refers to a model expanded from a
+Large Language Model (LLM) that possesses the capability to handle and infer
+multi-modal data. Current MLLMs typically begin by using LLMs to decompose
+tasks into multiple subtasks, then employing individual pre-trained models to
+complete specific subtasks, and ultimately utilizing LLMs to integrate the
+results of each subtask to obtain the results of the task. In real-world
+scenarios, when dealing with large projects, it is common practice to break
+down the project into smaller sub-projects, with different teams providing
+corresponding solutions or results. The project owner then decides which
+solution or result to use, ensuring the best possible outcome for each subtask
+and, consequently, for the entire project. Inspired by this, this study
+considers selecting multiple pre-trained models to complete the same subtask.
+By combining the results from multiple pre-trained models, the optimal subtask
+result is obtained, enhancing the performance of the MLLM. Specifically, this
+study first selects multiple pre-trained models focused on the same subtask
+based on distinct evaluation approaches, and then invokes these models in
+parallel to process input data and generate corresponding subtask results.
+Finally, the results from multiple pre-trained models for the same subtask are
+compared using the LLM, and the best result is chosen as the outcome for that
+subtask. Extensive experiments are conducted in this study using GPT-4
+annotated datasets and human-annotated datasets. The results of various
+evaluation metrics adequately demonstrate the effectiveness of the proposed
+approach in this paper. +
+
+
+
+
+ + ☆ Link Prediction for Wikipedia Articles as a Natural Language Inference + Task + + +
+ The link prediction task is vital to automatically understanding the structure
+of large knowledge bases. In this paper, we present our system to solve this
+task at the Data Science and Advanced Analytics 2023 Competition "Efficient and
+Effective Link Prediction" (DSAA-2023 Competition) with a corpus containing
+948,233 training samples and 238,265 public test samples. This paper introduces
+an approach to link prediction in Wikipedia articles by formulating it as a
+natural language inference (NLI) task. Drawing inspiration from recent
+advancements in natural language processing and understanding, we cast link
+prediction as an NLI task, wherein the presence of a link between two articles
+is treated as a premise, and the task is to determine whether this premise
+holds based on the information presented in the articles. We implemented our
+system based on sentence-pair classification for link prediction for the
+Wikipedia Articles task. Our system achieved 0.99996 Macro F1-score and 1.00000
+Macro F1-score on the public and private test sets, respectively. Our team
+UIT-NLP ranked 3rd in performance on the private test set, equal to the scores
+of the first and second places. Our code is publicly available for research
+purposes. +
+
+ comment: Accepted at the 10th IEEE International Conference On Data Science + And Advanced Analytics (DSAA 2023) +
+
+
+
+
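The premise/hypothesis framing above maps directly onto standard sentence-pair classification. A hedged sketch follows; the checkpoint, the two-label mapping, and the article summaries are placeholders rather than the team's actual system.

```python
# Link prediction cast as sentence-pair classification (illustrative only).
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "bert-base-uncased"  # stand-in encoder, not the team's model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2)     # label 1 = "link exists" (assumed mapping)

premise = "Article A: summary of the source Wikipedia article."
hypothesis = "Article B: summary of the candidate target article."

inputs = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print("link" if logits.argmax(-1).item() == 1 else "no link")
```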
+ + ☆ Sparkles: Unlocking Chats Across Multiple Images for Multimodal + Instruction-Following Models + + +
+ Large language models exhibit enhanced zero-shot performance on various tasks +when fine-tuned with instruction-following data. Multimodal +instruction-following models extend these capabilities by integrating both text +and images. However, existing models such as MiniGPT-4 face challenges in +maintaining dialogue coherence in scenarios involving multiple images. A +primary reason is the lack of a specialized dataset for this critical +application. To bridge these gaps, we present SparklesChat, a multimodal +instruction-following model for open-ended dialogues across multiple images. To +support the training, we introduce SparklesDialogue, the first +machine-generated dialogue dataset tailored for word-level interleaved +multi-image and text interactions. Furthermore, we construct SparklesEval, a +GPT-assisted benchmark for quantitatively assessing a model's conversational +competence across multiple images and dialogue turns. Our experiments validate +the effectiveness of SparklesChat in understanding and reasoning across +multiple images and dialogue turns. Specifically, SparklesChat outperformed +MiniGPT-4 on established vision-and-language benchmarks, including the BISON +binary image selection task and the NLVR2 visual reasoning task. Moreover, +SparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding +MiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative +evaluations further demonstrate SparklesChat's generality in handling +real-world applications. All resources will be available at +https://github.com/HYPJUDY/Sparkles. + +
+
+
+
+
+ + ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained language models like ChatGPT have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks. Moreover, in bioinformatics, generating +functional programs poses additional notable challenges due to the amount of +domain knowledge, the need for complicated data operations, and intricate +functional dependencies between the operations. Here, we present BioCoder, a +benchmark developed to evaluate existing pre-trained models in generating +bioinformatics code. In relation to function-code generation, BioCoder covers +potential package dependencies, class declarations, and global variables. It +incorporates 1026 functions and 1243 methods in Python and Java from GitHub and +253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing +framework for evaluation, and we have applied it to evaluate many models +including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, +InstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes +the importance of domain knowledge, pragmatic code generation, and contextual +understanding. Our dataset, benchmark, Docker images, and scripts required for +testing are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ☆ Knowledge Distillation from Non-streaming to Streaming ASR Encoder using + Auxiliary Non-streaming Layer + + +
+ Streaming automatic speech recognition (ASR) models are restricted from +accessing future context, which results in worse performance compared to the +non-streaming models. To improve the performance of streaming ASR, knowledge +distillation (KD) from the non-streaming to streaming model has been studied, +mainly focusing on aligning the output token probabilities. In this paper, we +propose a layer-to-layer KD from the teacher encoder to the student encoder. To +ensure that features are extracted using the same context, we insert auxiliary +non-streaming branches to the student and perform KD from the non-streaming +teacher layer to the non-streaming auxiliary layer. We design a special KD loss +that leverages the autoregressive predictive coding (APC) mechanism to +encourage the streaming model to predict unseen future contexts. Experimental +results show that the proposed method can significantly reduce the word error +rate compared to previous token probability distillation methods. + +
+
+ comment: Accepted to Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Sensi-BERT: Towards Sensitivity Driven Fine-Tuning for + Parameter-Efficient BERT + + +
+ Large pre-trained language models have recently gained significant traction
+due to their improved performance on various down-stream tasks like text
+classification and question answering, requiring only a few epochs of
+fine-tuning. However, their large model sizes often prohibit their applications
+on resource-constrained edge devices. Existing solutions for yielding
+parameter-efficient BERT models largely rely on compute-exhaustive training and
+fine-tuning. Moreover, they often rely on additional compute-heavy models to
+mitigate the performance gap. In this paper, we present Sensi-BERT, a
+sensitivity-driven efficient fine-tuning of BERT models that can take an
+off-the-shelf pre-trained BERT model and yield highly parameter-efficient
+models for downstream tasks. In particular, we perform sensitivity analysis to
+rank each individual parameter tensor, which is then used to trim them
+accordingly during fine-tuning for a given parameter or FLOPs budget. Our
+experiments show the efficacy of Sensi-BERT across different downstream tasks
+including MNLI, QQP, QNLI, SST-2 and SQuAD, showing better performance at a
+similar or smaller parameter budget compared to various alternatives. +
+
+ comment: 6 pages, 5 figures, 2 tables +
+
+
+
+
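The abstract does not spell out the sensitivity measure, so the following is one plausible reading rather than Sensi-BERT itself: score each parameter tensor (here by the mean |weight x gradient| after a backward pass on calibration data) and greedily keep the highest-scoring tensors within a parameter budget.

```python
# Hypothetical sensitivity-driven tensor ranking and budgeted selection.
import torch

def rank_tensors(model: torch.nn.Module):
    """Rank parameter tensors by mean |weight * grad| (a sensitivity proxy).

    Assumes a backward pass on calibration data has populated .grad.
    """
    scores = []
    for name, p in model.named_parameters():
        if p.grad is not None:
            scores.append((name, (p.detach() * p.grad).abs().mean().item()))
    return sorted(scores, key=lambda kv: kv[1], reverse=True)

def select_within_budget(model: torch.nn.Module, ranked, budget_params: int):
    """Keep the most sensitive tensors until the parameter budget is hit."""
    sizes = {n: p.numel() for n, p in model.named_parameters()}
    kept, total = [], 0
    for name, _score in ranked:
        if total + sizes[name] <= budget_params:
            kept.append(name)
            total += sizes[name]
    return kept  # tensors to keep; the rest are trimmed or frozen
```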
+ + ♻ ☆ Deanthropomorphising NLP: Can a Language Model Be Conscious? + + +
+ This work is intended as a voice in the discussion over previous claims that
+a pretrained large language model (LLM) based on the Transformer model
+architecture can be sentient. Such claims have been made concerning the LaMDA
+model and also concerning the current wave of LLM-powered chatbots, such as
+ChatGPT. This claim, if confirmed, would have serious ramifications in the
+Natural Language Processing (NLP) community due to the widespread use of
+similar models. However, here we take the position that such a large language
+model cannot be sentient, or conscious, and that LaMDA in particular exhibits
+no advances over other similar models that would qualify it. We justify this by
+analysing the Transformer architecture through the Integrated Information
+Theory of consciousness. We see the claims of sentience as part of a wider
+tendency to use anthropomorphic language in NLP reporting. Regardless of the
+veracity of the claims, we consider this an opportune moment to take stock of
+progress in language modelling and consider the ethical implications of the
+task. In order to make this work helpful for readers outside the NLP community,
+we also present the necessary background in language modelling. +
+
+
+
+
+ + ♻ ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks, such as Reddit discussions. In contrast to traditional comment-only +methods, our approach to labelling a comment as hate speech involves a holistic +analysis of text and images grounded in the discussion context. This is done by +leveraging graph transformers to capture the contextual relationships in the +entire discussion surrounding a comment and grounding the interwoven fusion +layers that combine individual comments' text and image embeddings instead of +processing modalities separately. We compare the performance of our model to +baselines that only process individual comments and conduct extensive ablation +studies. To evaluate our work, we present a new dataset, HatefulDiscussions, +comprising complete multi-modal discussions from multiple online communities on +Reddit. We conclude with future work for multimodal solutions to deliver social +value in online contexts, arguing that capturing a holistic view of a +conversation significantly advances the effort to detect anti-social behaviour. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ♻ ☆ "It Felt Like Having a Second Mind": Investigating Human-AI + Co-creativity in Prewriting with Large Language Models + + +
+ Prewriting is the process of discovering and developing ideas before a first +draft, which requires divergent thinking and often implies unstructured +strategies such as diagramming, outlining, free-writing, etc. Although large +language models (LLMs) have been demonstrated to be useful for a variety of +tasks including creative writing, little is known about how users would +collaborate with LLMs to support prewriting. The preferred collaborative role +and initiative of LLMs during such a creativity process is also unclear. To +investigate human-LLM collaboration patterns and dynamics during prewriting, we +conducted a three-session qualitative study with 15 participants in two +creative tasks: story writing and slogan writing. The findings indicated that +during collaborative prewriting, there appears to be a three-stage iterative +Human-AI Co-creativity process that includes Ideation, Illumination, and +Implementation stages. This collaborative process champions the human in a +dominant role, in addition to mixed and shifting levels of initiative that +exist between humans and LLMs. This research also reports on collaboration +breakdowns that occur during this process, user perceptions of using existing +LLMs during Human-AI Co-creativity, and discusses design implications to +support this co-creativity process. + +
+
+ comment: Under Review; 25 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable tools for users to navigate the vast amount of content available on platforms like daily news websites and book recommendation services. However, existing recommenders face significant challenges in understanding the content of items. Large language models (LLMs), which possess deep semantic comprehension and extensive knowledge from pretraining, have proven to be effective in various natural language processing tasks. In this study, we explore the potential of leveraging both open- and closed-source LLMs to enhance content-based recommendation. With open-source LLMs, we utilize their deep layers as content encoders, enriching the representation of content at the embedding level. For closed-source LLMs, we employ prompting techniques to enrich the training data at the token level. Through comprehensive experiments, we demonstrate the high effectiveness of both types of LLMs and show the synergistic relationship between them. Notably, we observe a significant relative improvement of up to 19.32% over existing state-of-the-art recommendation models. These findings highlight the immense potential of both open- and closed-source LLMs for enhancing content-based recommendation systems. We will make our code and LLM-generated data available for other researchers to reproduce our results.
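+ A minimal sketch of the open-source half of this recipe, assuming a Hugging Face transformers model with simple masked mean pooling; the model choice, layer, and pooling are illustrative assumptions, not the paper's exact setup:

```python
# Sketch: use a frozen open-source LLM's deep hidden states as the item
# content encoder (news title, book blurb). Assumptions, not the paper's
# exact recipe: model choice, layer index, and mean pooling.
import torch
from transformers import AutoModel, AutoTokenizer

MODEL = "meta-llama/Llama-2-7b-hf"   # assumption: any open LLM works here
tok = AutoTokenizer.from_pretrained(MODEL)
enc = AutoModel.from_pretrained(MODEL, output_hidden_states=True).eval()

@torch.no_grad()
def encode_item(text: str, layer: int = -1) -> torch.Tensor:
    """Embed an item with one of the LLM's deep layers."""
    batch = tok(text, return_tensors="pt", truncation=True, max_length=128)
    hidden = enc(**batch).hidden_states[layer]    # (1, seq, dim)
    mask = batch["attention_mask"].unsqueeze(-1)  # (1, seq, 1)
    return (hidden * mask).sum(1) / mask.sum(1)   # masked mean pool -> (1, dim)

item_vec = encode_item("Scientists spot a new exoplanet around a nearby star.")
```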
+
+
+
+
+ + ♻ ☆ Playing with Words: Comparing the Vocabulary and Lexical Richness of + ChatGPT and Humans + + +
+ The introduction of Artificial Intelligence (AI) generative language models such as GPT (Generative Pre-trained Transformer) and tools such as ChatGPT has triggered a revolution that can transform how text is generated. This has many implications: for example, as AI-generated text becomes a significant fraction of all text, would this affect the language capabilities of readers and the training of newer AI tools? Would it affect the evolution of languages? Focusing on one specific aspect of language, words: will the use of tools such as ChatGPT increase or reduce the vocabulary used or the lexical richness? This has implications for words, as those not included in AI-generated content will tend to become less and less popular and may eventually be lost. In this work, we perform an initial comparison of the vocabulary and lexical richness of ChatGPT and humans when performing the same tasks. In more detail, we use two datasets containing the answers to different types of questions answered by ChatGPT and humans, and a third dataset in which ChatGPT paraphrases sentences and questions. The analysis shows that ChatGPT tends to use fewer distinct words and exhibits lower lexical richness than humans. These results are very preliminary, and additional datasets and ChatGPT configurations have to be evaluated to extract more general conclusions. Therefore, further research is needed to understand how the use of ChatGPT and, more broadly, generative AI tools will affect the vocabulary and lexical richness of different types of text and languages.
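+ For illustration, a toy version of the comparison described above, assuming word-level tokenization and the simple type-token ratio as the richness measure (the paper may use richer metrics):

```python
# Toy version of the vocabulary comparison: distinct-word counts and
# type-token ratio (TTR) as a simple lexical-richness proxy. The texts
# are placeholders; TTR is length-sensitive, so any real comparison
# should control for text length.
import re

def lexical_stats(text: str):
    tokens = re.findall(r"[a-z']+", text.lower())
    types = set(tokens)
    return len(tokens), len(types), (len(types) / len(tokens) if tokens else 0.0)

samples = {
    "human": "The tide rises because the moon's gravity tugs the oceans...",
    "chatgpt": "Tides are caused by the gravitational pull of the moon...",
}
for name, text in samples.items():
    n_tok, n_typ, ttr = lexical_stats(text)
    print(f"{name}: tokens={n_tok} distinct={n_typ} TTR={ttr:.2f}")
```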
+
+
+
+
+ + ♻ ☆ CARE-MI: Chinese Benchmark for Misinformation Evaluation in Maternity + and Infant Care + + +
+ The recent advances in natural language processing (NLP) have led to a new trend of applying large language models (LLMs) to real-world scenarios. While the latest LLMs are astonishingly fluent when interacting with humans, they suffer from the misinformation problem by unintentionally generating factually false statements. This can lead to harmful consequences, especially when produced within sensitive contexts, such as healthcare. Yet few previous works have focused on evaluating misinformation in the long-form (LF) generation of LLMs, especially for knowledge-intensive topics. Moreover, although LLMs have been shown to perform well in different languages, misinformation evaluation has been mostly conducted in English. To this end, we present a benchmark, CARE-MI, for evaluating LLM misinformation in: 1) a sensitive topic, specifically the maternity and infant care domain; and 2) a language other than English, namely Chinese. Most importantly, we provide an innovative paradigm for building LF generation evaluation benchmarks that can be transferred to other knowledge-intensive domains and low-resourced languages. Our proposed benchmark fills the gap between the extensive usage of LLMs and the lack of datasets for assessing the misinformation generated by these models. It contains 1,612 expert-checked questions, accompanied by human-selected references. Using our benchmark, we conduct extensive experiments and find that current Chinese LLMs are far from perfect on the topic of maternity and infant care. In an effort to minimize the reliance on human resources for performance evaluation, we offer off-the-shelf judgment models for automatically assessing the LF output of LLMs given benchmark questions. Moreover, we compare potential solutions for LF generation evaluation and provide insights for building better automated metrics.
+
+
+
+
+ + ♻ ☆ DocPrompt: Large-scale continue pretrain for zero-shot and few-shot + document question answering + + +
+ In this paper, we propose DocPrompt for document question answering tasks, with powerful zero-shot and few-shot performance. We propose a novel weakly supervised data generation method, a novel multi-stage training method, and a novel ensemble method that combines an understanding model with a generation model. We achieve state-of-the-art performance on 4 document question answering tasks. This method greatly improves the delivery efficiency and model performance of document question answering customer projects, reducing annotation and labor costs. Our demo can be found at https://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout.
+
+
+
+
+ + ♻ ☆ Exploring Large Language Models for Knowledge Graph Completion + + +
+ Knowledge graphs play a vital role in numerous artificial intelligence tasks, yet they frequently face the issue of incompleteness. In this study, we explore utilizing Large Language Models (LLMs) for knowledge graph completion. We consider triples in knowledge graphs as text sequences and introduce an innovative framework called Knowledge Graph LLM (KG-LLM) to model these triples. Our technique employs entity and relation descriptions of a triple as prompts and utilizes the response for predictions. Experiments on various benchmark knowledge graphs demonstrate that our method attains state-of-the-art performance in tasks such as triple classification and relation prediction. We also find that fine-tuning relatively smaller models (e.g., LLaMA-7B, ChatGLM-6B) outperforms recent ChatGPT and GPT-4.
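+ A hedged sketch of the prompting idea: serialize a triple with its entity descriptions and ask the model for a yes/no judgment. The template and the `ask_llm` callable are hypothetical stand-ins, not the paper's exact prompt:

```python
# Prompt-style triple classification: a triple plus entity descriptions
# becomes a text prompt; the LLM's response is read as the prediction.
# Both the template and `ask_llm` are hypothetical.
def triple_prompt(head, relation, tail, head_desc, tail_desc) -> str:
    return (
        f"Head entity: {head} ({head_desc})\n"
        f"Relation: {relation}\n"
        f"Tail entity: {tail} ({tail_desc})\n"
        "Is this triple factually correct? Answer yes or no."
    )

def classify_triple(ask_llm, triple, descs) -> bool:
    """`ask_llm` is any callable str -> str, e.g. a fine-tuned LLaMA-7B."""
    return ask_llm(triple_prompt(*triple, *descs)).strip().lower().startswith("yes")

print(triple_prompt("Paris", "capital_of", "France",
                    "a European capital city", "a country in Europe"))
```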
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ OLISIA: a Cascade System for Spoken Dialogue State Tracking + + +
+ Though Dialogue State Tracking (DST) is a core component of spoken dialogue systems, recent work on this task mostly deals with chat corpora, disregarding the discrepancies between spoken and written language. In this paper, we propose OLISIA, a cascade system which integrates an Automatic Speech Recognition (ASR) model and a DST model. We introduce several adaptations in the ASR and DST modules to improve integration and robustness to spoken conversations. With these adaptations, our system ranked first in DSTC11 Track 3, a benchmark to evaluate spoken DST. We conduct an in-depth analysis of the results and find that normalizing the ASR outputs and adapting the DST inputs through data augmentation, along with increasing the size of the pre-trained models, all play an important role in reducing the performance discrepancy between written and spoken conversations.
+
+
+
+
+ + ♻ ☆ Improving Non-autoregressive Translation Quality with Pretrained + Language Model, Embedding Distillation and Upsampling Strategy for CTC + + +
+ Non-autoregressive approaches aim to improve the inference speed of translation models, particularly those that generate output in a one-pass forward manner. However, these approaches often suffer from a significant drop in translation quality compared to autoregressive models. This paper introduces a series of innovative techniques to enhance the translation quality of Non-Autoregressive Translation (NAT) models while maintaining a substantial acceleration in inference speed. We propose fine-tuning Pretrained Multilingual Language Models (PMLMs) with the CTC loss to train NAT models effectively. Furthermore, we adopt the MASK insertion scheme for up-sampling instead of token duplication, and we present an embedding distillation method to further enhance performance. In our experiments, our model outperforms the baseline autoregressive model (Transformer base) on multiple datasets, including WMT'14 DE↔EN, WMT'16 RO↔EN, and IWSLT'14 DE↔EN. Notably, our model achieves better performance than the baseline autoregressive model on the IWSLT'14 DE↔EN and WMT'16 RO↔EN datasets even without using distillation data during training. It is worth highlighting that on the IWSLT'14 DE→EN dataset, our model achieves an impressive BLEU score of 39.59, setting a new state-of-the-art. Additionally, our model exhibits a remarkable speed improvement of 16.35 times over the autoregressive model.
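+ A minimal sketch of the two ingredients named above, under illustrative assumptions about shapes and special-token ids: MASK-insertion upsampling of the source, and CTC training of the one-pass decoder via PyTorch's `nn.CTCLoss`:

```python
# MASK-insertion upsampling (instead of token duplication) plus CTC
# training of a one-pass NAT decoder. Vocab ids and shapes are toy values.
import torch
import torch.nn as nn

MASK_ID, BLANK_ID, VOCAB = 4, 0, 1000

def mask_insertion_upsample(src: torch.Tensor, ratio: int = 2) -> torch.Tensor:
    """(B, S) -> (B, S*ratio): each source token is followed by [MASK]s."""
    b, s = src.shape
    up = torch.full((b, s, ratio), MASK_ID, dtype=src.dtype)
    up[:, :, 0] = src
    return up.reshape(b, s * ratio)

src = torch.randint(5, VOCAB, (8, 20))
up_src = mask_insertion_upsample(src)               # (8, 40), fed to the decoder
logits = torch.randn(8, 40, VOCAB)                  # stand-in for decoder(up_src)
log_probs = logits.log_softmax(-1).transpose(0, 1)  # (T, B, V) as CTC expects
targets = torch.randint(5, VOCAB, (8, 25))          # reference translations
loss = nn.CTCLoss(blank=BLANK_ID, zero_infinity=True)(
    log_probs, targets,
    input_lengths=torch.full((8,), 40, dtype=torch.long),
    target_lengths=torch.full((8,), 25, dtype=torch.long))
```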
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue + Evaluation ACL2023 + + +
+ Existing reference-free turn-level evaluation metrics for chatbots +inadequately capture the interaction between the user and the system. +Consequently, they often correlate poorly with human evaluations. To address +this issue, we propose a novel model-agnostic approach that leverages +Conditional Pointwise Mutual Information (C-PMI) to measure the turn-level +interaction between the system and the user based on a given evaluation +dimension. Experimental results on the widely used FED dialogue evaluation +dataset demonstrate that our approach significantly improves the correlation +with human judgment compared with existing evaluation systems. By replacing the +negative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve +a relative 60.5% higher Spearman correlation on average for the FED evaluation +metric. Our code is publicly available at https://github.com/renll/C-PMI. + +
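+ Read at its simplest, the score is a conditional PMI between the user reply r and the system turn q given the dialogue context c: C-PMI(r, q | c) = log p(r | c, q) - log p(r | c). A minimal sketch, assuming a hypothetical `lm_logprob` scorer; the paper's exact conditioning per evaluation dimension may differ:

```python
# C-PMI(reply, turn | context) = log p(reply | context, turn)
#                              - log p(reply | context)
# `lm_logprob(text, condition)` is a hypothetical callable returning the
# log-likelihood of `text` under a language model given `condition`.
def c_pmi(lm_logprob, context: str, system_turn: str, user_reply: str) -> float:
    joint = lm_logprob(text=user_reply, condition=context + "\n" + system_turn)
    marginal = lm_logprob(text=user_reply, condition=context)
    return joint - marginal
```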
+
+ comment: Published at ACL2023 DialDoc Workshop; Updated Results +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 118 + +
+
+
+ + ☆ PointLLM: Empowering Large Language Models to Understand Point Clouds + + +
+ The unprecedented advancements in Large Language Models (LLMs) have created a +profound impact on natural language processing but are yet to fully embrace the +realm of 3D understanding. This paper introduces PointLLM, a preliminary effort +to fill this gap, thereby enabling LLMs to understand point clouds and offering +a new avenue beyond 2D visual data. PointLLM processes colored object point +clouds with human instructions and generates contextually appropriate +responses, illustrating its grasp of point clouds and common sense. +Specifically, it leverages a point cloud encoder with a powerful LLM to +effectively fuse geometric, appearance, and linguistic information. We collect +a novel dataset comprising 660K simple and 70K complex point-text instruction +pairs to enable a two-stage training strategy: initially aligning latent spaces +and subsequently instruction-tuning the unified model. To rigorously evaluate +our model's perceptual abilities and its generalization capabilities, we +establish two benchmarks: Generative 3D Object Classification and 3D Object +Captioning, assessed through three different methods, including human +evaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment +results show that PointLLM demonstrates superior performance over existing 2D +baselines. Remarkably, in human-evaluated object captioning tasks, PointLLM +outperforms human annotators in over 50% of the samples. Codes, datasets, and +benchmarks are available at https://github.com/OpenRobotLab/PointLLM . + +
+
+ comment: 19 pages. Empowering large language models with 3D point cloud + understanding, accompanied by a novel dataset and carefully designed + benchmarks. Project page: https://runsenxu.com/projects/PointLLM +
+
+
+
+
+ + ☆ StyleInV: A Temporal Style Modulated Inversion Network for Unconditional + Video Generation ICCV 2023 + + +
+ Unconditional video generation is a challenging task that involves synthesizing high-quality videos that are both coherent and of extended duration. To address this challenge, researchers have used pretrained StyleGAN image generators for high-quality frame synthesis and focused on motion generator design. The motion generator is trained in an autoregressive manner using heavy 3D convolutional discriminators to ensure motion coherence during video generation. In this paper, we introduce a novel motion generator design that uses a learning-based GAN inversion network. The encoder in our method captures rich and smooth priors from encoding images to latents, and given the latent of an initially generated frame as guidance, our method can generate smooth future latents by modulating the inversion encoder temporally. Our method enjoys the advantage of sparse training and naturally constrains the generation space of our motion generator with the inversion network guided by the initial frame, eliminating the need for heavy discriminators. Moreover, our method supports style transfer with simple fine-tuning when the encoder is paired with a pretrained StyleGAN generator. Extensive experiments conducted on various benchmarks demonstrate the superiority of our method in generating long and high-resolution videos with decent single-frame quality and temporal consistency.
+
+ comment: ICCV 2023. Code: https://github.com/johannwyh/StyleInV Project page: + https://www.mmlab-ntu.com/project/styleinv/index.html +
+
+
+
+
+ + ☆ Fine-Grained Cross-View Geo-Localization Using a Correlation-Aware + Homography Estimator + + +
+ In this paper, we introduce a novel approach to fine-grained cross-view +geo-localization. Our method aligns a warped ground image with a corresponding +GPS-tagged satellite image covering the same area using homography estimation. +We first employ a differentiable spherical transform, adhering to geometric +principles, to accurately align the perspective of the ground image with the +satellite map. This transformation effectively places ground and aerial images +in the same view and on the same plane, reducing the task to an image alignment +problem. To address challenges such as occlusion, small overlapping range, and +seasonal variations, we propose a robust correlation-aware homography estimator +to align similar parts of the transformed ground image with the satellite +image. Our method achieves sub-pixel resolution and meter-level GPS accuracy by +mapping the center point of the transformed ground image to the satellite image +using a homography matrix and determining the orientation of the ground camera +using a point above the central axis. Operating at a speed of 30 FPS, our +method outperforms state-of-the-art techniques, reducing the mean metric +localization error by 21.3% and 32.4% in same-area and cross-area +generalization tasks on the VIGOR benchmark, respectively, and by 34.4% on the +KITTI benchmark in same-area evaluation. + +
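+ A small sketch of the final localization step described above, assuming the homography has already been estimated (an identity placeholder here) and an illustrative angle convention:

```python
# Map the center of the transformed ground image into the satellite image
# via the homography, then estimate heading from a point above the central
# axis. H is a placeholder for the correlation-aware estimator's output.
import numpy as np

def project(H: np.ndarray, pt) -> np.ndarray:
    """Apply a 3x3 homography to a 2D point (homogeneous divide)."""
    v = H @ np.array([pt[0], pt[1], 1.0])
    return v[:2] / v[2]

H = np.eye(3)                                  # placeholder homography
h, w = 512, 512                                # transformed ground image size
center_px = project(H, (w / 2, h / 2))         # pixel carrying the GPS fix
axis_px = project(H, (w / 2, h / 2 - 50))      # a point above the central axis
d = axis_px - center_px                        # camera forward direction
heading_deg = np.degrees(np.arctan2(d[0], -d[1]))  # illustrative convention
```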
+
+ comment: 19 pages. Reducing the cross-view geo-localization problem to a 2D + image alignment problem by utilizing BEV transformation, and completing the + alignment process with a correlation-aware homography estimator. Code: + https://github.com/xlwangDev/HC-Net +
+
+
+
+
+ + ☆ InterDiff: Generating 3D Human-Object Interactions with Physics-Informed + Diffusion ICCV 2023 + + +
+ This paper addresses a novel task of anticipating 3D human-object +interactions (HOIs). Most existing research on HOI synthesis lacks +comprehensive whole-body interactions with dynamic objects, e.g., often limited +to manipulating small or static objects. Our task is significantly more +challenging, as it requires modeling dynamic objects with various shapes, +capturing whole-body motion, and ensuring physically valid interactions. To +this end, we propose InterDiff, a framework comprising two key steps: (i) +interaction diffusion, where we leverage a diffusion model to encode the +distribution of future human-object interactions; (ii) interaction correction, +where we introduce a physics-informed predictor to correct denoised HOIs in a +diffusion step. Our key insight is to inject prior knowledge that the +interactions under reference with respect to contact points follow a simple +pattern and are easily predictable. Experiments on multiple human-object +interaction datasets demonstrate the effectiveness of our method for this task, +capable of producing realistic, vivid, and remarkably long-term 3D HOI +predictions. + +
+
+ comment: ICCV 2023; Project Page: https://sirui-xu.github.io/InterDiff/ +
+
+
+
+
+ + ☆ PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic + Occupancy Prediction + + +
+ Semantic segmentation in autonomous driving has been undergoing an evolution +from sparse point segmentation to dense voxel segmentation, where the objective +is to predict the semantic occupancy of each voxel in the concerned 3D space. +The dense nature of the prediction space has rendered existing efficient +2D-projection-based methods (e.g., bird's eye view, range view, etc.) +ineffective, as they can only describe a subspace of the 3D scene. To address +this, we propose a cylindrical tri-perspective view to represent point clouds +effectively and comprehensively and a PointOcc model to process them +efficiently. Considering the distance distribution of LiDAR point clouds, we +construct the tri-perspective view in the cylindrical coordinate system for +more fine-grained modeling of nearer areas. We employ spatial group pooling to +maintain structural details during projection and adopt 2D backbones to +efficiently process each TPV plane. Finally, we obtain the features of each +point by aggregating its projected features on each of the processed TPV planes +without the need for any post-processing. Extensive experiments on both 3D +occupancy prediction and LiDAR segmentation benchmarks demonstrate that the +proposed PointOcc achieves state-of-the-art performance with much faster speed. +Specifically, despite only using LiDAR, PointOcc significantly outperforms all +other methods, including multi-modal methods, with a large margin on the +OpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc. + +
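+ A sketch of the cylindrical tri-perspective binning, with illustrative grid sizes; each point is binned in (rho, phi, z) and then indexes the three TPV planes:

```python
# Bin LiDAR points in cylindrical coordinates; each point then indexes the
# three TPV planes via the pairs (rho, phi), (rho, z) and (phi, z).
import torch

def to_grid(v, lo, hi, bins):
    return ((v - lo) / (hi - lo) * bins).long().clamp(0, bins - 1)

pts = torch.randn(10000, 3) * 20             # toy LiDAR point cloud
rho = torch.linalg.norm(pts[:, :2], dim=1)
phi = torch.atan2(pts[:, 1], pts[:, 0])      # [-pi, pi]
z = pts[:, 2]

# uniform bins in (rho, phi): angular cells are physically smaller near
# the sensor, giving finer modeling of nearer areas
i = to_grid(rho, 0.0, 50.0, 256)
j = to_grid(phi, -torch.pi, torch.pi, 256)
k = to_grid(z, -4.0, 4.0, 32)
plane_rp, plane_rz, plane_pz = (i, j), (i, k), (j, k)
```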
+
+ comment: Code is available at https://github.com/wzzheng/PointOcc +
+
+
+
+
+ + ☆ EMDB: The Electromagnetic Database of Global 3D Human Pose and Shape in + the Wild ICCV 2023 + + +
+ We present EMDB, the Electromagnetic Database of Global 3D Human Pose and +Shape in the Wild. EMDB is a novel dataset that contains high-quality 3D SMPL +pose and shape parameters with global body and camera trajectories for +in-the-wild videos. We use body-worn, wireless electromagnetic (EM) sensors and +a hand-held iPhone to record a total of 58 minutes of motion data, distributed +over 81 indoor and outdoor sequences and 10 participants. Together with +accurate body poses and shapes, we also provide global camera poses and body +root trajectories. To construct EMDB, we propose a multi-stage optimization +procedure, which first fits SMPL to the 6-DoF EM measurements and then refines +the poses via image observations. To achieve high-quality results, we leverage +a neural implicit avatar model to reconstruct detailed human surface geometry +and appearance, which allows for improved alignment and smoothness via a dense +pixel-level objective. Our evaluations, conducted with a multi-view volumetric +capture system, indicate that EMDB has an expected accuracy of 2.3 cm +positional and 10.6 degrees angular error, surpassing the accuracy of previous +in-the-wild datasets. We evaluate existing state-of-the-art monocular RGB +methods for camera-relative and global pose estimation on EMDB. EMDB is +publicly available under https://ait.ethz.ch/emdb + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Language-Conditioned Path Planning + + +
+ Contact is at the core of robotic manipulation. At times, it is desired (e.g., manipulation and grasping), and at times, it is harmful (e.g., when avoiding obstacles). However, traditional path planning algorithms focus solely on collision-free paths, limiting their applicability in contact-rich tasks. To address this limitation, we propose the domain of Language-Conditioned Path Planning, where contact-awareness is incorporated into the path planning problem. As a first step in this domain, we propose Language-Conditioned Collision Functions (LACO), a novel approach that learns a collision function using only a single-view image, a language prompt, and the robot configuration. LACO predicts collisions between the robot and the environment, enabling flexible, conditional path planning without the need for manual object annotations, point cloud data, or ground-truth object meshes. In both simulation and the real world, we demonstrate that LACO can facilitate complex, nuanced path plans that allow for interaction with objects that are safe to collide with, rather than prohibiting any collision.
+
+ comment: Conference on Robot Learning, 2023 +
+
+
+
+
+ + ☆ GNFactor: Multi-Task Real Robot Learning with Generalizable Neural + Feature Fields + + +
+ It is a long-standing problem in robotics to develop agents capable of +executing diverse manipulation tasks from visual observations in unstructured +real-world environments. To achieve this goal, the robot needs to have a +comprehensive understanding of the 3D structure and semantics of the scene. In +this work, we present $\textbf{GNFactor}$, a visual behavior cloning agent for +multi-task robotic manipulation with $\textbf{G}$eneralizable $\textbf{N}$eural +feature $\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural +field (GNF) as a reconstruction module and a Perceiver Transformer as a +decision-making module, leveraging a shared deep 3D voxel representation. To +incorporate semantics in 3D, the reconstruction module utilizes a +vision-language foundation model ($\textit{e.g.}$, Stable Diffusion) to distill +rich semantic information into the deep 3D voxel. We evaluate GNFactor on 3 +real robot tasks and perform detailed ablations on 10 RLBench tasks with a +limited number of demonstrations. We observe a substantial improvement of +GNFactor over current state-of-the-art methods in seen and unseen tasks, +demonstrating the strong generalization ability of GNFactor. Our project +website is https://yanjieze.com/GNFactor/ . + +
+
+ comment: CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/ +
+
+
+
+
+ + ☆ TouchStone: Evaluating Vision-Language Models by Language Models + + +
+ Large vision-language models (LVLMs) have recently witnessed rapid advancements, exhibiting a remarkable capacity for perceiving, understanding, and processing visual information by connecting visual receptors with large language models (LLMs). However, current assessments mainly focus on recognizing and reasoning abilities, lacking direct evaluation of conversational skills and neglecting visual storytelling abilities. In this paper, we propose an evaluation method that uses strong LLMs as judges to comprehensively evaluate the various abilities of LVLMs. Firstly, we construct a comprehensive visual dialogue dataset, TouchStone, consisting of open-world images and questions, covering five major categories of abilities and 27 subtasks. This dataset not only covers fundamental recognition and comprehension but also extends to literary creation. Secondly, by integrating detailed image annotations, we effectively transform the multimodal input content into a form understandable by LLMs. This enables us to employ advanced LLMs to directly evaluate the quality of the multimodal dialogue without requiring human intervention. Through validation, we demonstrate that powerful LLMs, such as GPT-4, can effectively score dialogue quality by leveraging their textual capabilities alone, aligning with human preferences. We hope our work can serve as a touchstone for LVLMs' evaluation and pave the way for building stronger LVLMs. The evaluation code is available at https://github.com/OFA-Sys/TouchStone.
+
+ comment: https://github.com/OFA-Sys/TouchStone +
+
+
+
+
+ + ☆ Text2Scene: Text-driven Indoor Scene Stylization with Part-aware Details CVPR 2023 + + +
+ We propose Text2Scene, a method to automatically create realistic textures +for virtual scenes composed of multiple objects. Guided by a reference image +and text descriptions, our pipeline adds detailed texture on labeled 3D +geometries in the room such that the generated colors respect the hierarchical +structure or semantic parts that are often composed of similar materials. +Instead of applying flat stylization on the entire scene at a single step, we +obtain weak semantic cues from geometric segmentation, which are further +clarified by assigning initial colors to segmented parts. Then we add texture +details for individual objects such that their projections on image space +exhibit feature embedding aligned with the embedding of the input. The +decomposition makes the entire pipeline tractable to a moderate amount of +computation resources and memory. As our framework utilizes the existing +resources of image and text embedding, it does not require dedicated datasets +with high-quality textures designed by skillful artists. To the best of our +knowledge, it is the first practical and scalable approach that can create +detailed and realistic textures of the desired style that maintain structural +context for scenes with multiple objects. + +
+
+ comment: Accepted to CVPR 2023 +
+
+
+
+
+ + ☆ SportsSloMo: A New Benchmark and Baselines for Human-centric Video Frame + Interpolation + + +
+ Human-centric video frame interpolation has great potential to improve people's entertainment experiences and to find commercial applications in the sports analysis industry, e.g., synthesizing slow-motion videos. Although there are multiple benchmark datasets available in the community, none is dedicated to human-centric scenarios. To bridge this gap, we introduce SportsSloMo, a benchmark consisting of more than 130K video clips and 1M video frames of high-resolution (≥720p) slow-motion sports videos crawled from YouTube. We re-train several state-of-the-art methods on our benchmark, and the results show a decrease in their accuracy compared to other datasets. This highlights the difficulty of our benchmark and suggests that it poses significant challenges even for the best-performing methods, as human bodies are highly deformable and occlusions are frequent in sports videos. To improve accuracy, we introduce two loss terms that incorporate human-aware priors, adding auxiliary supervision from panoptic segmentation and human keypoint detection, respectively. The loss terms are model-agnostic and can be easily plugged into any video frame interpolation approach. Experimental results validate the effectiveness of our proposed loss terms, leading to consistent performance improvements over 5 existing models and establishing strong baselines on our benchmark. The dataset and code can be found at: https://neu-vi.github.io/SportsSlomo/.
+
+ comment: Project Page: https://neu-vi.github.io/SportsSlomo/ +
+
+
+
+
+ + ☆ Holistic Processing of Colour Images Using Novel Quaternion-Valued + Wavelets on the Plane + + +
+ We investigate the applicability of quaternion-valued wavelets on the plane +to holistic colour image processing. We present a methodology for decomposing +and reconstructing colour images using quaternionic wavelet filters associated +to recently developed quaternion-valued wavelets on the plane. We consider +compression, enhancement, segmentation, and denoising techniques to demonstrate +quaternion-valued wavelets as a promising tool for holistic colour image +processing. + +
+
+
+
+
+ + ☆ Self-pruning Graph Neural Network for Predicting Inflammatory Disease + Activity in Multiple Sclerosis from Brain MR Images + + +
+ Multiple Sclerosis (MS) is a severe neurological disease characterized by +inflammatory lesions in the central nervous system. Hence, predicting +inflammatory disease activity is crucial for disease assessment and treatment. +However, MS lesions can occur throughout the brain and vary in shape, size and +total count among patients. The high variance in lesion load and locations +makes it challenging for machine learning methods to learn a globally effective +representation of whole-brain MRI scans to assess and predict disease. +Technically it is non-trivial to incorporate essential biomarkers such as +lesion load or spatial proximity. Our work represents the first attempt to +utilize graph neural networks (GNN) to aggregate these biomarkers for a novel +global representation. We propose a two-stage MS inflammatory disease activity +prediction approach. First, a 3D segmentation network detects lesions, and a +self-supervised algorithm extracts their image features. Second, the detected +lesions are used to build a patient graph. The lesions act as nodes in the +graph and are initialized with image features extracted in the first stage. +Finally, the lesions are connected based on their spatial proximity and the +inflammatory disease activity prediction is formulated as a graph +classification task. Furthermore, we propose a self-pruning strategy to +auto-select the most critical lesions for prediction. Our proposed method +outperforms the existing baseline by a large margin (AUCs of 0.67 vs. 0.61 and +0.66 vs. 0.60 for one-year and two-year inflammatory disease activity, +respectively). Finally, our proposed method enjoys inherent explainability by +assigning an importance score to each lesion for the overall prediction. Code +is available at https://github.com/chinmay5/ms_ida.git + +
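+ A minimal sketch of the proximity-based patient graph described above, assuming lesion centroids and precomputed node features; the distance threshold is an assumption:

```python
# Build a patient graph: lesions become nodes carrying image features,
# connected when their centroids are spatially close. Feature extraction
# is stubbed; the 20 mm radius is an illustrative assumption.
import torch

def build_lesion_graph(centers: torch.Tensor, feats: torch.Tensor,
                       radius: float = 20.0):
    """centers: (N, 3) lesion centroids in mm; feats: (N, D) node features.
    Returns (edge_index, feats) in the COO format GNN libraries expect."""
    d = torch.cdist(centers, centers)                 # (N, N) pairwise distances
    src, dst = torch.nonzero(d < radius, as_tuple=True)
    keep = src != dst                                 # drop self-loops
    edge_index = torch.stack([src[keep], dst[keep]])  # (2, E)
    return edge_index, feats

centers = torch.rand(12, 3) * 100
feats = torch.randn(12, 128)   # e.g., from the self-supervised extractor
edge_index, x = build_lesion_graph(centers, feats)
```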
+
+
+
+
+ + ☆ Diffusion Models for Interferometric Satellite Aperture Radar + + +
+ Probabilistic Diffusion Models (PDMs) have recently emerged as a very promising class of generative models, achieving high performance in natural image generation. However, their performance on non-natural images, such as radar-based satellite data, remains largely unknown. Generating large amounts of synthetic (and especially labelled) satellite data is crucial to implement deep-learning approaches for the processing and analysis of (interferometric) satellite aperture radar data. Here, we leverage PDMs to generate several radar-based satellite image datasets. We show that PDMs succeed in generating images with complex and realistic structures, but that sampling time remains an issue. Indeed, accelerated sampling strategies, which work well on simple image datasets like MNIST, fail on our radar datasets. We provide a simple and versatile open-source codebase (https://github.com/thomaskerdreux/PDM_SAR_InSAR_generation) to train, sample, and evaluate PDMs using any dataset on a single GPU.
+
+
+
+
+ + ☆ Coarse-to-Fine Amodal Segmentation with Shape Prior ICCV 2023 + + +
+ Amodal object segmentation is a challenging task that involves segmenting both visible and occluded parts of an object. In this paper, we propose a novel approach, called Coarse-to-Fine Segmentation (C2F-Seg), that addresses this problem by progressively modeling the amodal segmentation. C2F-Seg initially reduces the learning space from the pixel-level image space to the vector-quantized latent space. This enables us to better handle long-range dependencies and learn a coarse-grained amodal segment from visual features and visible segments. However, this latent space lacks detailed information about the object, which makes it difficult to provide a precise segmentation directly. To address this issue, we propose a convolutional refinement module to inject fine-grained information and provide a more precise amodal object segmentation based on visual features and the coarse-predicted segmentation. To support the study of amodal object segmentation, we create a synthetic amodal dataset, named MOViD-Amodal (MOViD-A), which can be used for both image and video amodal object segmentation. We extensively evaluate our model on two benchmark datasets: KINS and COCO-A. Our empirical results demonstrate the superiority of C2F-Seg. Moreover, we exhibit the potential of our approach for video amodal object segmentation tasks on FISHBOWL and our proposed MOViD-A. Project page: http://jianxgao.github.io/C2F-Seg.
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ BTSeg: Barlow Twins Regularization for Domain Adaptation in Semantic + Segmentation + + +
+ Semantic image segmentation is a critical component in many computer vision +systems, such as autonomous driving. In such applications, adverse conditions +(heavy rain, night time, snow, extreme lighting) on the one hand pose specific +challenges, yet are typically underrepresented in the available datasets. +Generating more training data is cumbersome and expensive, and the process +itself is error-prone due to the inherent aleatoric uncertainty. To address +this challenging problem, we propose BTSeg, which exploits image-level +correspondences as weak supervision signal to learn a segmentation model that +is agnostic to adverse conditions. To this end, our approach uses the Barlow +twins loss from the field of unsupervised learning and treats images taken at +the same location but under different adverse conditions as "augmentations" of +the same unknown underlying base image. This allows the training of a +segmentation model that is robust to appearance changes introduced by different +adverse conditions. We evaluate our approach on ACDC and the new challenging +ACG benchmark to demonstrate its robustness and generalization capabilities. +Our approach performs favorably when compared to the current state-of-the-art +methods, while also being simpler to implement and train. The code will be +released upon acceptance. + +
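+ For reference, the Barlow Twins objective the method builds on, in a minimal PyTorch form; here the two "views" are embeddings of the same location under two adverse conditions, and lambda follows the original Barlow Twins paper:

```python
# Barlow Twins loss: drive the cross-correlation matrix of the two views'
# (batch-normalized) embeddings toward the identity, so paired captures
# agree while embedding dimensions decorrelate.
import torch

def barlow_twins_loss(z1: torch.Tensor, z2: torch.Tensor,
                      lambd: float = 5e-3) -> torch.Tensor:
    n, d = z1.shape
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
    c = (z1.T @ z2) / n                              # (d, d) cross-correlation
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
    return on_diag + lambd * off_diag

loss = barlow_twins_loss(torch.randn(64, 256), torch.randn(64, 256))
```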
+
+
+
+
+ + ☆ Multiscale Residual Learning of Graph Convolutional Sequence Chunks for + Human Motion Prediction + + +
+ A new method is proposed for human motion prediction by learning temporal and +spatial dependencies. Recently, multiscale graphs have been developed to model +the human body at higher abstraction levels, resulting in more stable motion +prediction. Current methods however predetermine scale levels and combine +spatially proximal joints to generate coarser scales based on human priors, +even though movement patterns in different motion sequences vary and do not +fully comply with a fixed graph of spatially connected joints. Another problem +with graph convolutional methods is mode collapse, in which predicted poses +converge around a mean pose with no discernible movements, particularly in +long-term predictions. To tackle these issues, we propose ResChunk, an +end-to-end network which explores dynamically correlated body components based +on the pairwise relationships between all joints in individual sequences. +ResChunk is trained to learn the residuals between target sequence chunks in an +autoregressive manner to enforce the temporal connectivities between +consecutive chunks. It is hence a sequence-to-sequence prediction network which +considers dynamic spatio-temporal features of sequences at multiple levels. Our +experiments on two challenging benchmark datasets, CMU Mocap and Human3.6M, +demonstrate that our proposed method is able to effectively model the sequence +information for motion prediction and outperform other techniques to set a new +state-of-the-art. Our code is available at +https://github.com/MohsenZand/ResChunk. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models + + +
+ Zero-shot referring image segmentation is a challenging task because it aims +to find an instance segmentation mask based on the given referring +descriptions, without training on this type of paired data. Current zero-shot +methods mainly focus on using pre-trained discriminative models (e.g., CLIP). +However, we have observed that generative models (e.g., Stable Diffusion) have +potentially understood the relationships between various visual elements and +text descriptions, which are rarely investigated in this task. In this work, we +introduce a novel Referring Diffusional segmentor (Ref-Diff) for this task, +which leverages the fine-grained multi-modal information from generative +models. We demonstrate that without a proposal generator, a generative model +alone can achieve comparable performance to existing SOTA weakly-supervised +models. When we combine both generative and discriminative models, our Ref-Diff +outperforms these competing methods by a significant margin. This indicates +that generative models are also beneficial for this task and can complement +discriminative models for better referring segmentation. Our code is publicly +available at https://github.com/kodenii/Ref-Diff. + +
+
+
+
+
+ + ☆ Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation + Using only Images ICCV 2023 + + +
+ Generating 3D faces from textual descriptions has a multitude of applications, such as gaming, movies, and robotics. Recent progress has demonstrated the success of unconditional 3D face generation and text-to-3D shape generation. However, due to the limited text-3D face data pairs, text-driven 3D face generation remains an open problem. In this paper, we propose a text-guided 3D face generation method, referred to as TG-3DFace, for generating realistic 3D faces using text guidance. Specifically, we adopt an unconditional 3D face generation framework and equip it with text conditions, which learns text-guided 3D face generation with only text-2D face data. On top of that, we propose two text-to-face cross-modal alignment techniques, including global contrastive learning and a fine-grained alignment module, to facilitate high semantic consistency between generated 3D faces and input texts. Besides, we present directional classifier guidance during the inference process, which encourages creativity for out-of-domain generations. Compared to existing methods, TG-3DFace creates more realistic and aesthetically pleasing 3D faces, boosting multi-view consistency (MVIC) by 9% over Latent3D. The rendered face images generated by TG-3DFace achieve better FID and CLIP scores than text-to-2D face/image generation models, demonstrating our superiority in generating realistic and semantically consistent textures.
+
+ comment: accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Unsupervised CT Metal Artifact Reduction by Plugging Diffusion Priors in + Dual Domains + + +
+ During the process of computed tomography (CT), metallic implants often cause +disruptive artifacts in the reconstructed images, impeding accurate diagnosis. +Several supervised deep learning-based approaches have been proposed for +reducing metal artifacts (MAR). However, these methods heavily rely on training +with simulated data, as obtaining paired metal artifact CT and clean CT data in +clinical settings is challenging. This limitation can lead to decreased +performance when applying these methods in clinical practice. Existing +unsupervised MAR methods, whether based on learning or not, typically operate +within a single domain, either in the image domain or the sinogram domain. In +this paper, we propose an unsupervised MAR method based on the diffusion model, +a generative model with a high capacity to represent data distributions. +Specifically, we first train a diffusion model using CT images without metal +artifacts. Subsequently, we iteratively utilize the priors embedded within the +pre-trained diffusion model in both the sinogram and image domains to restore +the degraded portions caused by metal artifacts. This dual-domain processing +empowers our approach to outperform existing unsupervised MAR methods, +including another MAR method based on the diffusion model, which we have +qualitatively and quantitatively validated using synthetic datasets. Moreover, +our method demonstrates superior visual results compared to both supervised and +unsupervised methods on clinical datasets. + +
+
+
+
+
+ + ☆ Socratis: Are large multimodal models emotionally aware? ICCV 2023 + + +
+ Existing emotion prediction benchmarks contain coarse emotion labels which do not consider the diversity of emotions that an image and text can elicit in humans due to various reasons. Learning diverse reactions to multimodal content is important as intelligent machines take a central role in generating and delivering content to society. To address this gap, we propose Socratis, a societal reactions benchmark, where each image-caption (IC) pair is annotated with multiple emotions and the reasons for feeling them. Socratis contains 18K free-form reactions for 980 emotions on 2075 image-caption pairs from 5 widely-read news and image-caption (IC) datasets. We benchmark the capability of state-of-the-art multimodal large language models to generate the reasons for feeling an emotion given an IC pair. Based on a preliminary human study, we observe that humans prefer human-written reasons more than twice as often as machine-generated ones. This shows our task is harder than standard generation tasks, and it starkly contrasts recent findings that humans cannot tell apart machine- and human-written news articles, for instance. We further see that current captioning metrics based on large vision-language models also fail to correlate with human preferences. We hope that these findings and our benchmark will inspire further research on training emotionally aware models.
+
+ comment: ICCV 2023 WECIA +
+
+
+
+
+ + ☆ Parsing is All You Need for Accurate Gait Recognition in the Wild ACM MM 2023 + + +
+ Binary silhouettes and keypoint-based skeletons have dominated human gait recognition studies for decades since they are easy to extract from video frames. Despite their success in gait recognition in laboratory environments, they usually fail in real-world scenarios due to their low information entropy as gait representations. To achieve accurate gait recognition in the wild, this paper presents a novel gait representation, named Gait Parsing Sequence (GPS). GPSs are sequences of fine-grained human segmentations, i.e., human parsing, extracted from video frames, so they have much higher information entropy for encoding the shapes and dynamics of fine-grained human parts during walking. Moreover, to effectively explore the capability of the GPS representation, we propose a novel human parsing-based gait recognition framework, named ParsingGait. ParsingGait contains a Convolutional Neural Network (CNN)-based backbone and two lightweight heads. The first head extracts global semantic features from GPSs, while the second learns the mutual information of part-level features through Graph Convolutional Networks to model the detailed dynamics of human walking. Furthermore, due to the lack of suitable datasets, we build the first parsing-based dataset for gait recognition in the wild, named Gait3D-Parsing, by extending the large-scale and challenging Gait3D dataset. Based on Gait3D-Parsing, we comprehensively evaluate our method and existing gait recognition methods. The experimental results show a significant improvement in accuracy brought by the GPS representation and the superiority of ParsingGait. The code and dataset are available at https://gait3d.github.io/gait3d-parsing-hp .
+
+ comment: 16 pages, 14 figures, ACM MM 2023 accepted, project page: + https://gait3d.github.io/gait3d-parsing-hp +
+
+
+
+
+ + ☆ US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for + Cervical Lymph Node Lesions Diagnoses in Ultrasound Images + + +
+ Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph node lesions. However, the diagnosis of these images largely hinges on the expertise of medical practitioners, rendering the process susceptible to misdiagnoses. Although rapidly developing deep learning has substantially improved the diagnosis of diverse ultrasound images, there remains a conspicuous research gap concerning cervical lymph nodes. The objective of our work is to accurately diagnose cervical lymph node lesions by leveraging a deep learning model. To this end, we first collected 3392 images containing normal lymph nodes, benign lymph node lesions, malignant primary lymph node lesions, and malignant metastatic lymph node lesions. Given that ultrasound images are generated by the reflection and scattering of sound waves across varied bodily tissues, we propose the Conv-FFT Block, which integrates convolutional operations with the fast Fourier transform to model the images more astutely. Building upon this foundation, we design a novel architecture, named US-SFNet. This architecture not only discerns variances in ultrasound images in the spatial domain but also adeptly captures microstructural alterations across various lesions in the frequency domain. To ascertain the potential of US-SFNet, we benchmarked it against 12 popular architectures through five-fold cross-validation. The results show that US-SFNet achieves state-of-the-art performance, with 92.89% accuracy, 90.46% precision, 89.95% sensitivity, and 97.49% specificity.
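+ The abstract does not spell out the Conv-FFT Block's internals, so here is a hedged sketch of one plausible reading: a spatial convolution branch fused with a learnable frequency-domain filter applied via `torch.fft` (a GFNet-style global filter); layer sizes and the fusion rule are assumptions:

```python
# One plausible Conv-FFT style block: conv branch (spatial domain) plus a
# learnable complex filter applied to the feature map's 2D FFT (frequency
# domain), summed and normalized. This is an assumption, not US-SFNet.
import torch
import torch.nn as nn

class ConvFFTBlock(nn.Module):
    def __init__(self, ch: int, h: int, w: int):
        super().__init__()
        self.conv = nn.Conv2d(ch, ch, 3, padding=1)
        # learnable global filter over rfft2 frequencies (stored as 2 reals)
        self.freq_weight = nn.Parameter(torch.randn(ch, h, w // 2 + 1, 2) * 0.02)
        self.bn = nn.BatchNorm2d(ch)

    def forward(self, x):
        spatial = self.conv(x)
        f = torch.fft.rfft2(x, norm="ortho")
        f = f * torch.view_as_complex(self.freq_weight)
        freq = torch.fft.irfft2(f, s=x.shape[-2:], norm="ortho")
        return torch.relu(self.bn(spatial + freq))

block = ConvFFTBlock(32, 64, 64)
y = block(torch.randn(2, 32, 64, 64))
```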
+
+
+
+
+ + ☆ Post-Deployment Adaptation with Access to Source Data via Federated + Learning and Source-Target Remote Gradient Alignment MICCAI 2023 + + +
+ Deployment of Deep Neural Networks in medical imaging is hindered by +distribution shift between training data and data processed after deployment, +causing performance degradation. Post-Deployment Adaptation (PDA) addresses +this by tailoring a pre-trained, deployed model to the target data distribution +using limited labelled or entirely unlabelled target data, while assuming no +access to source training data as they cannot be deployed with the model due to +privacy concerns and their large size. This makes reliable adaptation +challenging due to limited learning signal. This paper challenges this +assumption and introduces FedPDA, a novel adaptation framework that brings the +utility of learning from remote data from Federated Learning into PDA. FedPDA +enables a deployed model to obtain information from source data via remote +gradient exchange, while aiming to optimize the model specifically for the +target domain. Tailored for FedPDA, we introduce a novel optimization method +StarAlign (Source-Target Remote Gradient Alignment) that aligns gradients +between source-target domain pairs by maximizing their inner product, to +facilitate learning a target-specific model. We demonstrate the method's +effectiveness using multi-center databases for the tasks of cancer metastases +detection and skin lesion classification, where our method compares favourably +to previous work. Code is available at: https://github.com/FelixWag/StarAlign + +
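+ A simplified, single-machine sketch of the gradient-alignment idea (maximize the inner product of source and target gradients while descending on both losses); the real method runs across federated source-target pairs, and the alignment weight here is an assumption:

```python
# One aligned update: descend on source and target losses while ascending
# on their gradient inner product, so a step helps both domains. Requires
# losses built from differentiable ops (second-order autograd is used).
import torch

def aligned_step(model, loss_src, loss_tgt, lr=1e-3, align_weight=0.1):
    params = [p for p in model.parameters() if p.requires_grad]
    g_src = torch.autograd.grad(loss_src, params, create_graph=True)
    g_tgt = torch.autograd.grad(loss_tgt, params, create_graph=True)
    inner = sum((a * b).sum() for a, b in zip(g_src, g_tgt))
    total = loss_src + loss_tgt - align_weight * inner
    grads = torch.autograd.grad(total, params)
    with torch.no_grad():
        for p, g in zip(params, grads):
            p -= lr * g

model = torch.nn.Linear(16, 2)
ce = torch.nn.functional.cross_entropy
x_s, y_s = torch.randn(8, 16), torch.randint(0, 2, (8,))
x_t, y_t = torch.randn(8, 16), torch.randint(0, 2, (8,))
aligned_step(model, ce(model(x_s), y_s), ce(model(x_t), y_t))
```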
+
+ comment: This version was accepted for the Machine Learning in Medical Imaging + (MLMI 2023) workshop at MICCAI 2023 +
+
+
+
+
+ + ☆ Terrain Diffusion Network: Climatic-Aware Terrain Generation with + Geological Sketch Guidance + + +
+ Sketch-based terrain generation seeks to create realistic landscapes for virtual environments in various applications such as computer games, animation, and virtual reality. Recently, deep learning based terrain generation has emerged, notably methods based on generative adversarial networks (GAN). However, these methods often struggle to fulfill the requirements of flexible user control and to maintain generative diversity for realistic terrain. Therefore, we propose a novel diffusion-based method, namely the terrain diffusion network (TDN), which actively incorporates user guidance for enhanced controllability, taking into account terrain features like rivers, ridges, basins, and peaks. Instead of adhering to a conventional monolithic denoising process, which often compromises the fidelity of terrain details or the alignment with user control, a multi-level denoising scheme is proposed to generate more realistic terrains by taking into account fine-grained details, particularly those related to climatic patterns influenced by erosion and tectonic activities. Specifically, three terrain synthesisers are designed for structural, intermediate, and fine-grained level denoising, allowing each synthesiser to concentrate on a distinct terrain aspect. Moreover, to maximise the efficiency of our TDN, we further introduce terrain and sketch latent spaces for the synthesisers with pre-trained terrain autoencoders. Comprehensive experiments on a new dataset constructed from NASA Topology Images clearly demonstrate the effectiveness of our proposed method, achieving state-of-the-art performance. Our code and dataset will be publicly available.
+
+
+
+
+ + ☆ Towards Vehicle-to-everything Autonomous Driving: A Survey on + Collaborative Perception + + +
+ Vehicle-to-everything (V2X) autonomous driving opens up a promising direction +for developing a new generation of intelligent transportation systems. +Collaborative perception (CP) as an essential component to achieve V2X can +overcome the inherent limitations of individual perception, including occlusion +and long-range perception. In this survey, we provide a comprehensive review of +CP methods for V2X scenarios, bringing a profound and in-depth understanding to +the community. Specifically, we first introduce the architecture and workflow +of typical V2X systems, which affords a broader perspective to understand the +entire V2X system and the role of CP within it. Then, we thoroughly summarize +and analyze existing V2X perception datasets and CP methods. Particularly, we +introduce numerous CP methods from various crucial perspectives, including +collaboration stages, roadside sensors placement, latency compensation, +performance-bandwidth trade-off, attack/defense, pose alignment, etc. Moreover, +we conduct extensive experimental analyses to compare and examine current CP +methods, revealing some essential and unexplored insights. Specifically, we +analyze the performance changes of different methods under different +bandwidths, providing a deep insight into the performance-bandwidth trade-off +issue. Also, we examine methods under different LiDAR ranges. To study the +model robustness, we further investigate the effects of various simulated +real-world noises on the performance of different CP methods, covering +communication latency, lossy communication, localization errors, and mixed +noises. In addition, we look into the sim-to-real generalization ability of +existing CP methods. At last, we thoroughly discuss issues and challenges, +highlighting promising directions for future efforts. Our codes for +experimental analysis will be public at +https://github.com/memberRE/Collaborative-Perception. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ ViLTA: Enhancing Vision-Language Pre-training through Textual + Augmentation + + +
+ Vision-language pre-training (VLP) methods are blossoming recently, and their crucial goal is to jointly learn visual and textual features via a transformer-based architecture, demonstrating promising improvements on a variety of vision-language tasks. Prior works usually focus on how to align visual and textual features, but strategies for improving the robustness of the model and speeding up model convergence remain insufficiently explored. In this paper, we propose a novel method, ViLTA, comprising two components to further facilitate the model to learn fine-grained representations among image-text pairs. For Masked Language Modeling (MLM), we propose a cross-distillation method to generate soft labels to enhance the robustness of the model, which alleviates the problem of treating synonyms of masked words as negative samples in one-hot labels. For Image-Text Matching (ITM), we leverage the current language encoder to synthesize hard negatives based on the context of the language input, encouraging the model to learn high-quality representations by increasing the difficulty of the ITM task. By leveraging the above techniques, our ViLTA can achieve better performance on various vision-language tasks. Extensive experiments on benchmark datasets demonstrate the effectiveness of ViLTA and its promising potential for vision-language pre-training.
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor + Attack + + +
+ The vulnerabilities to backdoor attacks have recently threatened the +trustworthiness of machine learning models in practical applications. +Conventional wisdom suggests that not everyone can be an attacker since the +process of designing the trigger generation algorithm often involves +significant effort and extensive experimentation to ensure the attack's +stealthiness and effectiveness. Alternatively, this paper shows that there +exists a more severe backdoor threat: anyone can exploit an easily-accessible +algorithm for silent backdoor attacks. Specifically, this attacker can employ +the widely-used lossy image compression from a plethora of compression tools to +effortlessly inject a trigger pattern into an image without leaving any +noticeable trace; i.e., the generated triggers are natural artifacts. One does +not require extensive knowledge to click on the "convert" or "save as" button +while using tools for lossy image compression. Via this attack, the adversary +does not need to design a trigger generator as seen in prior works and only +requires poisoning the data. Empirically, the proposed attack consistently +achieves 100% attack success rate in several benchmark datasets such as MNIST, +CIFAR-10, GTSRB and CelebA. More significantly, the proposed attack can still +achieve almost 100% attack success rate with very small (approximately 10%) +poisoning rates in the clean label setting. The generated trigger of the +proposed attack using one lossy compression algorithm is also transferable +across other related compression algorithms, exacerbating the severity of this +backdoor threat. This work takes another crucial step toward understanding the +extensive risks of backdoor attacks in practice, urging practitioners to +investigate similar attacks and relevant backdoor mitigation methods. + +
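+ A minimal sketch of how little the attack requires: heavy lossy compression itself acts as the trigger, so poisoning is just re-encoding a fraction of the training set (the JPEG quality and poison rate below are illustrative):

```python
# Lossy compression as a natural backdoor trigger: re-encode a fraction
# of training images at low JPEG quality and relabel them. In the
# clean-label variant described above, labels are kept and only
# target-class images are compressed.
import io
import random
from PIL import Image

def jpeg_trigger(img: Image.Image, quality: int = 10) -> Image.Image:
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=quality)  # the "save as" step
    buf.seek(0)
    return Image.open(buf).convert("RGB")

def poison(dataset, target_label, rate: float = 0.1):
    """dataset: list of (PIL image, label). Returns a poisoned copy."""
    out = []
    for img, label in dataset:
        if random.random() < rate:
            out.append((jpeg_trigger(img), target_label))
        else:
            out.append((img, label))
    return out
```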
+
+ comment: 14 pages. This paper shows everyone can mount a powerful and stealthy + backdoor attack with the widely-used lossy image compression +
+
+
+
+
+ + ☆ Diffusion Inertial Poser: Human Motion Reconstruction from Arbitrary + Sparse IMU Configurations + + +
+ Motion capture from a limited number of inertial measurement units (IMUs) has +important applications in health, human performance, and virtual reality. +Real-world limitations and application-specific goals dictate different IMU +configurations (i.e., number of IMUs and chosen attachment body segments), +trading off accuracy and practicality. Although recent works were successful in +accurately reconstructing whole-body motion from six IMUs, these systems only +work with a specific IMU configuration. Here we propose a single diffusion +generative model, Diffusion Inertial Poser (DiffIP), which reconstructs human +motion in real-time from arbitrary IMU configurations. We show that DiffIP has +the benefit of flexibility with respect to the IMU configuration while being as +accurate as the state-of-the-art for the commonly used six IMU configuration. +Our system enables selecting an optimal configuration for different +applications without retraining the model. For example, when only four IMUs are +available, DiffIP found that the configuration that minimizes errors in joint +kinematics instruments the thighs and forearms. However, global translation +reconstruction is better when instrumenting the feet instead of the thighs. +Although our approach is agnostic to the underlying model, we built DiffIP +based on physiologically realistic musculoskeletal models to enable use in +biomedical research and health applications. + +
+
+
+
+
+ + ☆ SoccerNet 2023 Tracking Challenge -- 3rd place MOT4MOT Team Technical + Report + + +
+ The SoccerNet 2023 tracking challenge requires the detection and tracking of +soccer players and the ball. In this work, we present our approach to tackle +these tasks separately. We employ a state-of-the-art online multi-object +tracker and a contemporary object detector for player tracking. To overcome the +limitations of our online approach, we incorporate a post-processing stage +using interpolation and appearance-free track merging. Additionally, an +appearance-based track merging technique is used to handle the termination and +creation of tracks far from the image boundaries. Ball tracking is formulated +as single object detection, and a fine-tuned YOLOv8l detector with proprietary +filtering improves the detection precision. Our method achieves 3rd place on +the SoccerNet 2023 tracking challenge with a HOTA score of 66.27. + +
+
+ comment: 3 pages, 1 figure +
+
+
+
+
+ + ☆ Learning with Multi-modal Gradient Attention for Explainable Composed + Image Retrieval + + +
+ We consider the problem of composed image retrieval, in which an input query
+consists of an image and a modification text indicating the desired changes to
+be made on the image, and the goal is to retrieve images that match these
+changes. Current state-of-the-art techniques that address this problem use
+global features for the retrieval, resulting in incorrect localization of the
+regions of interest to be modified because of the global nature of the
+features, more so in cases of real-world, in-the-wild images. Since modifier
+texts usually correspond to specific local changes in an image, it is critical
+that models learn local features to be able to both localize and retrieve
+better. To this end, our key novelty is a new gradient-attention-based
+learning objective that explicitly forces the model to focus on the local
+regions of interest being modified in each retrieval step. We achieve this by
+first proposing a new visual image attention computation technique, which we
+call multi-modal gradient attention (MMGrad), that is explicitly conditioned
+on the modifier text. We next demonstrate how MMGrad can be incorporated into
+an end-to-end model training strategy with a new learning objective that
+explicitly forces these MMGrad attention maps to highlight the correct local
+regions corresponding to the modifier text. By training retrieval models with
+this new loss function, we show improved grounding by means of better visual
+attention maps, leading to better explainability of the models as well as
+competitive quantitative retrieval performance on standard benchmark datasets.
+
+
+
+
+ + ☆ Generate Your Own Scotland: Satellite Image Generation Conditioned on + Maps + + +
+ Despite recent advancements in image generation, diffusion models remain
+largely underexplored in Earth Observation. In this paper we show that
+state-of-the-art pretrained diffusion models can be conditioned on
+cartographic data to generate realistic satellite images. We provide two large
+datasets of paired OpenStreetMap images and satellite views over the region of
+Mainland Scotland and the Central Belt. We train a ControlNet model and
+qualitatively evaluate the results, demonstrating that both high image quality
+and strong map fidelity are achievable. Finally, we provide some insights on
+the opportunities and challenges of applying these models for remote sensing.
+Our model weights and code for creating the dataset are publicly available at
+https://github.com/miquel-espinosa/map-sat.
+
+ comment: 13 pages, 6 figures. preprint +
+
+
+
+
+ + ☆ Learning Channel Importance for High Content Imaging with Interpretable + Deep Input Channel Mixing + + +
+ Uncovering novel drug candidates for treating complex diseases remains one of
+the most challenging tasks in early discovery research. To tackle this
+challenge, biopharma research has established a standardized high content
+imaging protocol that tags different cellular compartments per image channel.
+In order to judge the experimental outcome, the scientist requires knowledge
+about the channel importance with respect to a certain phenotype for decoding
+the underlying biology. In contrast to traditional image analysis approaches,
+such experiments are nowadays preferably analyzed by deep learning based
+approaches which, however, lack crucial information about the channel
+importance. To overcome this limitation, we present a novel approach which
+utilizes multi-spectral information of high content images to interpret a
+certain aspect of cellular biology. To this end, we base our method on image
+blending concepts with alpha compositing for an arbitrary number of channels.
+More specifically, we introduce DCMIX, a lightweight, scalable and end-to-end
+trainable mixing layer which enables interpretable predictions in high content
+imaging while retaining the benefits of deep learning based methods. We employ
+an extensive set of experiments on both MNIST and RXRX1 datasets,
+demonstrating that DCMIX learns the biologically relevant channel importance
+without sacrificing prediction performance.
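+ For intuition, a learnable alpha-compositing mixer in the spirit of DCMIX
+could look like the following sketch (assuming PyTorch; this is not the
+authors' code, and DCMIX's exact formulation may differ).
+```python
+import torch
+import torch.nn as nn
+
+class ChannelMix(nn.Module):
+    """Blend a multi-channel image into one plane with learnable channel weights."""
+
+    def __init__(self, num_channels: int):
+        super().__init__()
+        self.alpha = nn.Parameter(torch.zeros(num_channels))  # one weight per channel
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (batch, channels, H, W); softmax makes the blend a convex combination
+        w = torch.softmax(self.alpha, dim=0).view(1, -1, 1, 1)
+        return (x * w).sum(dim=1, keepdim=True)
+
+# After training, softmax(alpha) can be read off as channel-importance scores.
+```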
+
+ comment: Accepted @ DAGM German Conference on Pattern Recognition (GCPR) 2023 +
+
+
+
+
+ + ☆ MFR-Net: Multi-faceted Responsive Listening Head Generation via + Denoising Diffusion Model ACM MM 2023 + + +
+ Face-to-face communication is a common scenario including roles of speakers
+and listeners. Most existing research methods focus on producing speaker
+videos, while the generation of listener heads remains largely overlooked.
+Responsive listening head generation is an important task that aims to model
+face-to-face communication scenarios by generating a listener head video given
+a speaker video and a listener head image. An ideal generated responsive
+listening video should respond to the speaker by expressing attitude or
+viewpoint, while maintaining diversity in interaction patterns and accuracy in
+listener identity information. To achieve this goal, we propose the
+\textbf{M}ulti-\textbf{F}aceted \textbf{R}esponsive Listening Head Generation
+Network (MFR-Net). Specifically, MFR-Net employs the probabilistic denoising
+diffusion model to predict diverse head pose and expression features. In order
+to perform multi-faceted response to the speaker video, while maintaining
+accurate listener identity preservation, we design the Feature Aggregation
+Module to boost listener identity features and fuse them with other
+speaker-related features. Finally, a renderer finetuned with identity
+consistency loss produces the final listening head videos. Our extensive
+experiments demonstrate that MFR-Net achieves multi-faceted responses not only
+in diversity and listener identity preservation but also in attitude and
+viewpoint expression.
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Semi-Supervised SAR ATR Framework with Transductive Auxiliary + Segmentation + + +
+ Convolutional neural networks (CNNs) have achieved high performance in
+synthetic aperture radar (SAR) automatic target recognition (ATR). However,
+the performance of CNNs depends heavily on a large amount of training data.
+The insufficiency of labeled training SAR images limits the recognition
+performance and even invalidates some ATR methods. Furthermore, with only a
+few labeled training samples, many existing CNNs become ineffective. To
+address these challenges, we propose a Semi-supervised SAR ATR Framework with
+transductive Auxiliary Segmentation (SFAS). The proposed framework focuses on
+exploiting the transductive generalization on available unlabeled samples with
+an auxiliary loss serving as a regularizer. Through auxiliary segmentation of
+unlabeled SAR samples and information residue loss (IRL) in training, the
+framework can employ the proposed training loop process and gradually exploit
+the information compilation of recognition and segmentation to construct a
+helpful inductive bias and achieve high performance. Experiments conducted on
+the MSTAR dataset have shown the effectiveness of our proposed SFAS for
+few-shot learning. A recognition performance of 94.18\% can be achieved with
+20 training samples per class, together with simultaneous accurate
+segmentation results. Facing variances of EOCs, the recognition ratios remain
+higher than 88.00\% with only 10 training samples per class.
+
+
+
+
+ + ☆ 3D-STMN: Dependency-Driven Superpoint-Text Matching Network for + End-to-End 3D Referring Expression Segmentation + + +
+ In 3D Referring Expression Segmentation (3D-RES), the earlier approach adopts
+a two-stage paradigm, extracting segmentation proposals and then matching them
+with referring expressions. However, this conventional paradigm encounters
+significant challenges, most notably in terms of the generation of lackluster
+initial proposals and a pronounced deceleration in inference speed.
+Recognizing these limitations, we introduce an innovative end-to-end
+Superpoint-Text Matching Network (3D-STMN) that is enriched by
+dependency-driven insights. One of the keystones of our model is the
+Superpoint-Text Matching (STM) mechanism. Unlike traditional methods that
+navigate through instance proposals, STM directly correlates linguistic
+indications with their respective superpoints, clusters of semantically
+related points. This architectural decision empowers our model to efficiently
+harness cross-modal semantic relationships, primarily leveraging densely
+annotated superpoint-text pairs, as opposed to the more sparse instance-text
+pairs. In pursuit of enhancing the role of text in guiding the segmentation
+process, we further incorporate the Dependency-Driven Interaction (DDI) module
+to deepen the network's semantic comprehension of referring expressions. Using
+the dependency trees as a beacon, this module discerns the intricate
+relationships between primary terms and their associated descriptors in
+expressions, thereby elevating both the localization and segmentation
+capacities of our model. Comprehensive experiments on the ScanRefer benchmark
+reveal that our model not only sets new performance standards, registering an
+mIoU gain of 11.7 points, but also achieves a staggering enhancement in
+inference speed, surpassing traditional methods by 95.7 times. The code and
+models are available at https://github.com/sosppxo/3D-STMN.
+
+
+
+
+ + ☆ Neural Gradient Regularizer + + +
+ Owing to their significant success, priors imposed on gradient maps have
+consistently been a subject of great interest in the field of image
+processing. Total variation (TV), one of the most representative regularizers,
+is known for its ability to capture the sparsity of gradient maps.
+Nonetheless, TV and its variants often underestimate the gradient maps,
+leading to the weakening of edges and details whose gradients should not be
+zero in the original image. Recently, total deep variation (TDV) has been
+introduced, assuming the sparsity of feature maps, which provides a flexible
+regularization learned from large-scale datasets for a specific task. However,
+TDV requires retraining when the image or task changes, limiting its
+versatility. In this paper, we propose a neural gradient regularizer (NGR)
+that expresses the gradient map as the output of a neural network. Unlike
+existing methods, NGR does not rely on the sparsity assumption, thereby
+avoiding the underestimation of gradient maps. NGR is applicable to various
+image types and different image processing tasks, functioning in a zero-shot
+learning fashion, making it a versatile and plug-and-play regularizer.
+Extensive experimental results demonstrate the superior performance of NGR
+over state-of-the-art counterparts for a range of different tasks, further
+validating its effectiveness and versatility.
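+ For context, the total variation prior mentioned above penalizes the $\ell_1$
+norm of the discrete gradient: for an image $u$,
+$\mathrm{TV}(u) = \sum_{i,j} \big( |u_{i+1,j} - u_{i,j}| + |u_{i,j+1} - u_{i,j}| \big)$,
+and a TV-regularized recovery problem takes the form
+$\min_u \frac{1}{2}\|\mathcal{A}(u) - f\|_2^2 + \lambda \, \mathrm{TV}(u)$.
+This $\ell_1$ penalty on gradients is precisely what drives them toward zero
+and causes the underestimation of edges and details that NGR is designed to
+avoid.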
+
+
+
+
+ + ☆ Detecting Out-of-Context Image-Caption Pairs in News: A + Counter-Intuitive Method + + +
+ The growth of misinformation and re-contextualized media in social media and
+news leads to an increasing need for fact-checking methods. Concurrently, the
+advancement in generative models makes cheapfakes and deepfakes both easier to
+make and harder to detect. In this paper, we present a novel approach using
+generative image models to our advantage for detecting Out-of-Context (OOC)
+use of image-caption pairs in news. We present two new datasets with a total
+of $6800$ images generated using two different generative models: (1) DALL-E 2
+and (2) Stable-Diffusion. We are confident that the method proposed in this
+paper can further research on generative models in the field of cheapfake
+detection, and that the resulting datasets can be used to train and evaluate
+new models aimed at detecting cheapfakes. We run a preliminary qualitative and
+quantitative analysis to evaluate the performance of each image generation
+model for this task, and evaluate a handful of methods for computing image
+similarity.
+
+ comment: ACM International Conference on Content-Based Multimedia Indexing + (CBMI '23) +
+
+
+
+
+ + ☆ Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation + + +
+ Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential
+role in the early diagnosis and treatment of liver cancer. Deep learning
+models backboned by fully convolutional neural networks (FCNNs) have become
+the dominant model for segmenting 3D computerized tomography (CT) scans.
+However, since their convolution layers suffer from limited kernel size, they
+are not able to capture long-range dependencies and global context. To tackle
+this restriction, vision transformers have been introduced to overcome the
+locality of FCNNs' receptive fields. Although transformers can capture
+long-range features, their segmentation performance decreases with various
+tumor sizes due to the model's sensitivity to the input patch size. While
+finding an optimal patch size improves the performance of vision
+transformer-based models on segmentation tasks, it is a time-consuming and
+challenging procedure. This paper proposes a technique to select the vision
+transformer's optimal input multi-resolution image patch size based on the
+average volume size of metastasis lesions. We further validated our suggested
+framework using a transfer-learning technique, demonstrating that the highest
+Dice similarity coefficient (DSC) performance was obtained by pre-training on
+training data with a larger tumor volume using the suggested ideal patch size
+and then training with a smaller one. We experimentally evaluate this idea
+through pre-training our model on a multi-resolution public dataset. Our model
+showed consistent and improved results when applied to our private
+multi-resolution mCRC dataset with a smaller average tumor volume. This study
+lays the groundwork for optimizing semantic segmentation of small objects
+using vision transformers. The implementation source code is available at:
+https://github.com/Ramtin-Mojtahedi/OVTPS.
+
+
+
+
+ + ☆ Any-Size-Diffusion: Toward Efficient Text-Driven Synthesis for Any-Size + HD Images + + +
+ Stable diffusion, a generative model used in text-to-image synthesis, +frequently encounters resolution-induced composition problems when generating +images of varying sizes. This issue primarily stems from the model being +trained on pairs of single-scale images and their corresponding text +descriptions. Moreover, direct training on images of unlimited sizes is +unfeasible, as it would require an immense number of text-image pairs and +entail substantial computational expenses. To overcome these challenges, we +propose a two-stage pipeline named Any-Size-Diffusion (ASD), designed to +efficiently generate well-composed images of any size, while minimizing the +need for high-memory GPU resources. Specifically, the initial stage, dubbed Any +Ratio Adaptability Diffusion (ARAD), leverages a selected set of images with a +restricted range of ratios to optimize the text-conditional diffusion model, +thereby improving its ability to adjust composition to accommodate diverse +image sizes. To support the creation of images at any desired size, we further +introduce a technique called Fast Seamless Tiled Diffusion (FSTD) at the +subsequent stage. This method allows for the rapid enlargement of the ASD +output to any high-resolution size, avoiding seaming artifacts or memory +overloads. Experimental results on the LAION-COCO and MM-CelebA-HQ benchmarks +demonstrate that ASD can produce well-structured images of arbitrary sizes, +cutting down the inference time by 2x compared to the traditional tiled +algorithm. + +
+
+
+
+
+ + ☆ GHuNeRF: Generalizable Human NeRF from a Monocular Video + + +
+ In this paper, we tackle the challenging task of learning a generalizable
+human NeRF model from a monocular video. Although existing generalizable human
+NeRFs have achieved impressive results, they require multi-view images or
+videos which might not always be available. On the other hand, some works on
+free-viewpoint rendering of humans from monocular videos cannot be generalized
+to unseen identities. In view of these limitations, we propose GHuNeRF to
+learn a generalizable human NeRF model from a monocular video of the human
+performer. We first introduce a visibility-aware aggregation scheme to compute
+vertex-wise features, which is used to construct a 3D feature volume. The
+feature volume can only represent the overall geometry of the human performer
+with insufficient accuracy due to the limited resolution. To solve this, we
+further enhance the volume feature with temporally aligned point-wise features
+using an attention mechanism. Finally, the enhanced feature is used for
+predicting density and color for each sampled point. A surface-guided sampling
+strategy is also introduced to improve the efficiency for both training and
+inference. We validate our approach on the widely-used ZJU-MoCap dataset,
+where we achieve comparable performance with existing multi-view video based
+approaches. We also test on the monocular People-Snapshot dataset and achieve
+better performance than existing works when only monocular video is used.
+
+
+
+
+ + ☆ Dual-Decoder Consistency via Pseudo-Labels Guided Data Augmentation for + Semi-Supervised Medical Image Segmentation + + +
+ Medical image segmentation methods often rely on fully supervised approaches +to achieve excellent performance, which is contingent upon having an extensive +set of labeled images for training. However, annotating medical images is both +expensive and time-consuming. Semi-supervised learning offers a solution by +leveraging numerous unlabeled images alongside a limited set of annotated ones. +In this paper, we introduce a semi-supervised medical image segmentation method +based on the mean-teacher model, referred to as Dual-Decoder Consistency via +Pseudo-Labels Guided Data Augmentation (DCPA). This method combines consistency +regularization, pseudo-labels, and data augmentation to enhance the efficacy of +semi-supervised segmentation. Firstly, the proposed model comprises both +student and teacher models with a shared encoder and two distinct decoders +employing different up-sampling strategies. Minimizing the output discrepancy +between decoders enforces the generation of consistent representations, serving +as regularization during student model training. Secondly, we introduce mixup +operations to blend unlabeled data with labeled data, creating mixed data and +thereby achieving data augmentation. Lastly, pseudo-labels are generated by the +teacher model and utilized as labels for mixed data to compute unsupervised +loss. We compare the segmentation results of the DCPA model with six +state-of-the-art semi-supervised methods on three publicly available medical +datasets. Beyond classical 10\% and 20\% semi-supervised settings, we +investigate performance with less supervision (5\% labeled data). Experimental +outcomes demonstrate that our approach consistently outperforms existing +semi-supervised medical image segmentation methods across the three +semi-supervised settings. + +
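+ A minimal sketch of the mixup-plus-pseudo-label step described above
+(assuming PyTorch; the function names, the Beta-mixing choice, and the
+cross-entropy loss are illustrative assumptions, not the authors' exact code):
+```python
+import torch
+import torch.nn.functional as F
+
+def unsupervised_mixup_loss(student, teacher, x_labeled, x_unlabeled, alpha=0.5):
+    """Blend unlabeled with labeled images; teacher predictions label the mix."""
+    lam = torch.distributions.Beta(alpha, alpha).sample()
+    x_mixed = lam * x_labeled + (1 - lam) * x_unlabeled
+    with torch.no_grad():
+        pseudo = teacher(x_mixed).argmax(dim=1)  # pseudo-labels from the teacher
+    return F.cross_entropy(student(x_mixed), pseudo)
+```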
+
+
+
+
+ + ☆ CL-MAE: Curriculum-Learned Masked Autoencoders + + +
+ Masked image modeling has been demonstrated as a powerful pretext task for
+generating robust representations that can be effectively generalized across
+multiple downstream tasks. Typically, this approach involves randomly masking
+patches (tokens) in input images, with the masking strategy remaining
+unchanged during training. In this paper, we propose a curriculum learning
+approach that updates the masking strategy to continually increase the
+complexity of the self-supervised reconstruction task. We conjecture that, by
+gradually increasing the task complexity, the model can learn more
+sophisticated and transferable representations. To facilitate this, we
+introduce a novel learnable masking module that possesses the capability to
+generate masks of different complexities, and integrate the proposed module
+into masked autoencoders (MAE). Our module is jointly trained with the MAE,
+adjusting its behavior during training by transitioning from a partner of the
+MAE (optimizing the same reconstruction loss) to an adversary (optimizing the
+opposite loss), while passing through a neutral state. The transition between
+these behaviors is smooth, regulated by a factor that is multiplied with the
+reconstruction loss of the masking module. The resulting training procedure
+generates an easy-to-hard curriculum. We train our Curriculum-Learned Masked
+Autoencoder (CL-MAE) on ImageNet and show that it exhibits superior
+representation learning capabilities compared to MAE. The empirical results on
+five downstream tasks confirm our conjecture, demonstrating that curriculum
+learning can be successfully used to self-supervise masked autoencoders.
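+ The partner-to-adversary transition described above can be pictured with a
+simple schedule (a sketch under our own assumptions; the paper's exact
+schedule may differ): the masking module's objective is the reconstruction
+loss scaled by a factor that moves smoothly from +1 to -1.
+```python
+def curriculum_factor(epoch: int, total_epochs: int) -> float:
+    """+1 (partner) -> 0 (neutral) -> -1 (adversary), linearly over training."""
+    progress = epoch / max(total_epochs - 1, 1)
+    return 1.0 - 2.0 * progress
+
+# mask_module_loss = curriculum_factor(epoch, total_epochs) * reconstruction_loss
+```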
+
+
+
+
+ + ☆ Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based + Approach + + +
+ In the rapidly evolving digital era, the analysis of document layouts plays a
+pivotal role in automated information extraction and interpretation. In our
+work, we have trained the MViTv2 transformer model architecture with cascaded
+mask R-CNN on the BaDLAD dataset to extract text boxes, paragraphs, images and
+tables from a document. After training on 20365 document images for 36 epochs
+in a three-phase cycle, we achieved a training loss of 0.2125 and a mask loss
+of 0.19. Our work extends beyond training, delving into the exploration of
+potential enhancement avenues. We investigate the impact of rotation and flip
+augmentation, the effectiveness of slicing input images pre-inference, the
+implications of varying the resolution of the transformer backbone, and the
+potential of employing a dual-pass inference to uncover missed text boxes.
+Through these explorations, we observe a spectrum of outcomes, where some
+modifications result in tangible performance improvements, while others offer
+unique insights for future endeavors.
+
+
+
+
+ + ☆ Shape of my heart: Cardiac models through learned signed distance + functions + + +
+ The efficient construction of an anatomical model is one of the major +challenges of patient-specific in-silico models of the human heart. Current +methods frequently rely on linear statistical models, allowing no advanced +topological changes, or requiring medical image segmentation followed by a +meshing pipeline, which strongly depends on image resolution, quality, and +modality. These approaches are therefore limited in their transferability to +other imaging domains. In this work, the cardiac shape is reconstructed by +means of three-dimensional deep signed distance functions with Lipschitz +regularity. For this purpose, the shapes of cardiac MRI reconstructions are +learned from public databases to model the spatial relation of multiple +chambers in Cartesian space. We demonstrate that this approach is also capable +of reconstructing anatomical models from partial data, such as point clouds +from a single ventricle, or modalities different from the trained MRI, such as +electroanatomical mapping, and in addition, allows us to generate new +anatomical shapes by randomly sampling latent vectors. + +
+
+
+
+
+ + ☆ ScrollNet: Dynamic Weight Importance for Continual Learning ICCV2023 + + +
+ The principle underlying most existing continual learning (CL) methods is to +prioritize stability by penalizing changes in parameters crucial to old tasks, +while allowing for plasticity in other parameters. The importance of weights +for each task can be determined either explicitly through learning a +task-specific mask during training (e.g., parameter isolation-based approaches) +or implicitly by introducing a regularization term (e.g., regularization-based +approaches). However, all these methods assume that the importance of weights +for each task is unknown prior to data exposure. In this paper, we propose +ScrollNet as a scrolling neural network for continual learning. ScrollNet can +be seen as a dynamic network that assigns the ranking of weight importance for +each task before data exposure, thus achieving a more favorable +stability-plasticity tradeoff during sequential task learning by reassigning +this ranking for different tasks. Additionally, we demonstrate that ScrollNet +can be combined with various CL methods, including regularization-based and +replay-based approaches. Experimental results on CIFAR100 and TinyImagenet +datasets show the effectiveness of our proposed method. We release our code at +https://github.com/FireFYF/ScrollNet.git. + +
+
+ comment: Accepted at Visual Continual Learning workshop (ICCV2023) +
+
+
+
+
+ + ☆ MoMA: Momentum Contrastive Learning with Multi-head Attention-based + Knowledge Distillation for Histopathology Image Analysis + + +
+ There is no doubt that advanced artificial intelligence models and high +quality data are the keys to success in developing computational pathology +tools. Although the overall volume of pathology data keeps increasing, a lack +of quality data is a common issue when it comes to a specific task due to +several reasons including privacy and ethical issues with patient data. In this +work, we propose to exploit knowledge distillation, i.e., utilize the existing +model to learn a new, target model, to overcome such issues in computational +pathology. Specifically, we employ a student-teacher framework to learn a +target model from a pre-trained, teacher model without direct access to source +data and distill relevant knowledge via momentum contrastive learning with +multi-head attention mechanism, which provides consistent and context-aware +feature representations. This enables the target model to assimilate +informative representations of the teacher model while seamlessly adapting to +the unique nuances of the target data. The proposed method is rigorously +evaluated across different scenarios where the teacher model was trained on the +same, relevant, and irrelevant classification tasks with the target model. +Experimental results demonstrate the accuracy and robustness of our approach in +transferring knowledge to different domains and tasks, outperforming other +related methods. Moreover, the results provide a guideline on the learning +strategy for different types of tasks and scenarios in computational pathology. +Code is available at: \url{https://github.com/trinhvg/MoMA}. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ E3CM: Epipolar-Constrained Cascade Correspondence Matching + + +
+ Accurate and robust correspondence matching is of utmost importance for +various 3D computer vision tasks. However, traditional explicit +programming-based methods often struggle to handle challenging scenarios, and +deep learning-based methods require large well-labeled datasets for network +training. In this article, we introduce Epipolar-Constrained Cascade +Correspondence (E3CM), a novel approach that addresses these limitations. +Unlike traditional methods, E3CM leverages pre-trained convolutional neural +networks to match correspondence, without requiring annotated data for any +network training or fine-tuning. Our method utilizes epipolar constraints to +guide the matching process and incorporates a cascade structure for progressive +refinement of matches. We extensively evaluate the performance of E3CM through +comprehensive experiments and demonstrate its superiority over existing +methods. To promote further research and facilitate reproducibility, we make +our source code publicly available at https://mias.group/E3CM. + +
+
+ comment: accepted to Neurocomputing +
+
+
+
+
+ + ☆ Prompt-enhanced Hierarchical Transformer Elevating Cardiopulmonary + Resuscitation Instruction via Temporal Action Segmentation + + +
+ The vast majority of people who suffer unexpected cardiac arrest receive
+cardiopulmonary resuscitation (CPR) from passersby in a desperate attempt to
+restore life, but these endeavors often turn out to be fruitless on account of
+a lack of qualification. Fortunately, many pieces of research manifest that
+disciplined training will help to elevate the success rate of resuscitation,
+which constantly calls for a seamless combination of novel techniques to yield
+further advancement. To this end, we collect a custom CPR video dataset in
+which trainees perform resuscitation on mannequins independently in adherence
+to approved guidelines, and we devise an auxiliary toolbox to assist in the
+supervision and rectification of potential intermediate issues via modern deep
+learning methodologies. Our research empirically views this problem as a
+temporal action segmentation (TAS) task in computer vision, which aims to
+segment an untrimmed video at a frame-wise level. Here, we propose a
+Prompt-enhanced hierarchical Transformer (PhiTrans) that integrates three
+indispensable modules, including a textual prompt-based Video Features
+Extractor (VFE), a transformer-based Action Segmentation Executor (ASE), and a
+regression-based Prediction Refinement Calibrator (PRC). The backbone of the
+model derives from approaches validated on three approved public datasets
+(GTEA, 50Salads, and Breakfast) collected for TAS tasks, which underpins the
+segmentation pipeline on the CPR dataset. In general, we are among the first
+to probe into a feasible pipeline that genuinely elevates CPR instruction
+qualification via action segmentation in conjunction with cutting-edge deep
+learning techniques. Associated experiments support our implementation, with
+multiple metrics surpassing 91.0%.
+
+ comment: Transformer for Cardiopulmonary Resuscitation +
+
+
+
+
+ + ☆ Object Detection for Caries or Pit and Fissure Sealing Requirement in + Children's First Permanent Molars + + +
+ Dental caries is one of the most common oral diseases that, if left
+untreated, can lead to a variety of oral problems. It mainly occurs inside the
+pits and fissures on the occlusal/buccal/palatal surfaces of molars, and
+children are a high-risk group for pit and fissure caries in permanent molars.
+Pit and fissure sealing is one of the most effective methods and is widely
+used in the prevention of pit and fissure caries. However, current detection
+of pits and fissures or caries depends primarily on experienced dentists,
+whose expertise ordinary parents lack, so children may miss remedial treatment
+without timely detection. To address this issue, we present a method to
+automatically detect caries and pit and fissure sealing requirements using
+oral photos taken by smartphones. We use the YOLOv5 and YOLOX models and adopt
+a tiling strategy to reduce information loss during image pre-processing. The
+best result for the YOLOXs model with the tiling strategy is 72.3 mAP.5, while
+the best result without the tiling strategy is 71.2. The YOLOv5s6 model
+with/without tiling attains 70.9/67.9 mAP.5, respectively. We deploy the
+pre-trained network to mobile devices as a WeChat applet, allowing in-home
+detection by parents or children's guardians.
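+ The tiling strategy amounts to cutting each photo into overlapping crops
+before detection so that small pits and fissures survive resizing; a rough
+sketch follows (tile size and overlap are hypothetical, assuming Pillow):
+```python
+def tile_image(img, tile=640, overlap=0.2):
+    """Yield (x, y, crop) tiles covering a PIL image with the given overlap."""
+    step = max(int(tile * (1 - overlap)), 1)
+    w, h = img.size
+    for y in range(0, max(h - tile, 0) + 1, step):
+        for x in range(0, max(w - tile, 0) + 1, step):
+            yield x, y, img.crop((x, y, x + tile, y + tile))
+```
+Detections from each tile are then shifted back by (x, y) and merged,
+typically with non-maximum suppression across tile boundaries.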
+
+
+
+
+ + ☆ Decoupled Local Aggregation for Point Cloud Learning + + +
+ The unstructured nature of point clouds demands that local aggregation be
+adaptive to different local structures. Previous methods meet this by
+explicitly embedding spatial relations into each aggregation process. Although
+this coupled approach has been shown effective in generating clear semantics,
+aggregation can be greatly slowed down due to repeated relation learning and
+redundant computation to mix directional and point features. In this work, we
+propose to decouple the explicit modelling of spatial relations from local
+aggregation. We theoretically prove that basic neighbor pooling operations can
+likewise function without loss of clarity in feature fusion, so long as
+essential spatial information has been encoded in point features. As an
+instantiation of decoupled local aggregation, we present DeLA, a lightweight
+point network in which, at each learning stage, relative spatial encodings are
+first formed, and only pointwise convolutions plus edge max-pooling are then
+used for local aggregation. Further, a regularization term is employed to
+reduce potential ambiguity through the prediction of relative coordinates.
+Though conceptually simple, experimental results on five classic benchmarks
+demonstrate that DeLA achieves state-of-the-art performance with reduced or
+comparable latency. Specifically, DeLA achieves over 90\% overall accuracy on
+ScanObjectNN and 74\% mIoU on S3DIS Area 5. Our code is available at
+https://github.com/Matrix-ASC/DeLA .
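+ As a sketch of what "pointwise convolutions plus edge max-pooling" can mean
+in practice (assuming PyTorch; this is our illustration, not the released
+DeLA code):
+```python
+import torch
+import torch.nn as nn
+
+class DecoupledAggregation(nn.Module):
+    """Pointwise MLP followed by max-pooling over each point's neighbors."""
+
+    def __init__(self, dim: int):
+        super().__init__()
+        self.mlp = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))
+
+    def forward(self, feats: torch.Tensor, neighbor_idx: torch.Tensor) -> torch.Tensor:
+        # feats: (N, C) point features that already carry spatial encodings
+        # neighbor_idx: (N, K) long indices of each point's K nearest neighbors
+        x = self.mlp(feats)                        # pointwise convolution
+        return x[neighbor_idx].max(dim=1).values  # edge max-pooling over neighbors
+```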
+
+
+
+
+ + ☆ Privacy-Preserving Medical Image Classification through Deep Learning + and Matrix Decomposition + + +
+ Deep learning (DL)-based solutions have been extensively researched in the +medical domain in recent years, enhancing the efficacy of diagnosis, planning, +and treatment. Since the usage of health-related data is strictly regulated, +processing medical records outside the hospital environment for developing and +using DL models demands robust data protection measures. At the same time, it +can be challenging to guarantee that a DL solution delivers a minimum level of +performance when being trained on secured data, without being specifically +designed for the given task. Our approach uses singular value decomposition +(SVD) and principal component analysis (PCA) to obfuscate the medical images +before employing them in the DL analysis. The capability of DL algorithms to +extract relevant information from secured data is assessed on a task of +angiographic view classification based on obfuscated frames. The security level +is probed by simulated artificial intelligence (AI)-based reconstruction +attacks, considering two threat actors with different prior knowledge of the +targeted data. The degree of privacy is quantitatively measured using +similarity indices. Although a trade-off between privacy and accuracy should be +considered, the proposed technique allows for training the angiographic view +classifier exclusively on secured data with satisfactory performance and with +no computational overhead, model adaptation, or hyperparameter tuning. While +the obfuscated medical image content is well protected against human +perception, the hypothetical reconstruction attack proved that it is also +difficult to recover the complete information of the original frames. + +
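+ One plausible reading of the SVD step, as a sketch (assuming NumPy and
+grayscale frames; the rank k and the exact obfuscation recipe are our
+assumptions, not the paper's):
+```python
+import numpy as np
+
+def svd_obfuscate(frame: np.ndarray, k: int = 8) -> np.ndarray:
+    """Keep only the k leading singular components of a 2D frame."""
+    U, s, Vt = np.linalg.svd(frame.astype(np.float64), full_matrices=False)
+    return (U[:, :k] * s[:k]) @ Vt[:k, :]
+```
+A low-rank approximation of this kind preserves coarse, anatomy-scale
+structure for the classifier while discarding fine detail, which is what makes
+exact reconstruction by an attacker difficult.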
+
+ comment: 6 pages, 9 figures, Published in: 2023 31st Mediterranean Conference + on Control and Automation (MED) +
+
+
+
+
+ + ☆ SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded + Objects + + +
+ To enable meaningful robotic manipulation of objects in the real world, 6D
+pose estimation is one of the critical aspects. Most existing approaches have
+difficulty extending predictions to scenarios where novel object instances are
+continuously introduced, especially with heavy occlusions. In this work, we
+propose a few-shot pose estimation (FSPE) approach called SA6D, which uses a
+self-adaptive segmentation module to identify the novel target object and
+construct a point cloud model of the target object using only a small number
+of cluttered reference images. Unlike existing methods, SA6D does not require
+object-centric reference images or any additional object information, making
+it a more generalizable and scalable solution across categories. We evaluate
+SA6D on real-world tabletop object datasets and demonstrate that SA6D
+outperforms existing FSPE methods, particularly in cluttered scenes with
+occlusions, while requiring fewer reference images.
+
+
+
+
+ + ☆ Unsupervised Recognition of Unknown Objects for Open-World Object + Detection + + +
+ Open-World Object Detection (OWOD) extends the object detection problem to a
+realistic and dynamic scenario, where a detection model is required to be
+capable of detecting both known and unknown objects and incrementally learning
+newly introduced knowledge. Current OWOD models, such as ORE and OW-DETR,
+focus on pseudo-labeling regions with high objectness scores as unknowns,
+whose performance relies heavily on the supervision of known objects. While
+they can detect the unknowns that exhibit similar features to the known
+objects, they suffer from a severe label bias problem: they tend to detect all
+regions (including unknown object regions) that are dissimilar to the known
+objects as part of the background. To eliminate the label bias, this paper
+proposes a novel approach that learns an unsupervised discriminative model to
+recognize true unknown objects from raw pseudo labels generated by
+unsupervised region proposal methods. The resulting model can be further
+refined by a classification-free self-training method which iteratively
+extends pseudo unknown objects to the unlabeled regions. Experimental results
+show that our method 1) significantly outperforms the prior SOTA in detecting
+unknown objects while maintaining competitive performance of detecting known
+object classes on the MS COCO dataset, and 2) achieves better generalization
+ability on the LVIS and Objects365 datasets.
+
+
+
+
+ + ☆ MS23D: A 3D Object Detection Method Using Multi-Scale Semantic Feature + Points to Construct 3D Feature Layers + + +
+ Lidar point clouds, as a type of data with accurate distance perception, can
+effectively represent the motion and posture of objects in three-dimensional
+space. However, the sparsity and disorderliness of point clouds make it
+challenging to extract features directly from them. Many studies have
+addressed this issue by transforming point clouds into regular voxel
+representations. However, these methods often lead to the loss of fine-grained
+local feature information due to downsampling. Moreover, the sparsity of point
+clouds poses difficulties in efficiently aggregating features in 3D feature
+layers using voxel-based two-stage methods. To address these issues, this
+paper proposes a two-stage 3D detection framework called MS$^{2}$3D. In
+MS$^{2}$3D, we utilize small-sized voxels to extract fine-grained local
+features and large-sized voxels to capture long-range local features.
+Additionally, we propose a method for constructing 3D feature layers using
+multi-scale semantic feature points, enabling the transformation of sparse 3D
+feature layers into more compact representations. Furthermore, we compute the
+offset between feature points in the 3D feature layers and the centroid of
+objects, aiming to bring them as close as possible to the object's center.
+This significantly enhances the efficiency of feature aggregation. To validate
+the effectiveness of our method, we evaluated it on both the KITTI and ONCE
+datasets.
+
+
+
+
+ + ☆ MVDream: Multi-view Diffusion for 3D Generation + + +
+ We propose MVDream, a multi-view diffusion model that is able to generate
+geometrically consistent multi-view images from a given text prompt. By
+leveraging image diffusion models pre-trained on large-scale web datasets and
+a multi-view dataset rendered from 3D assets, the resulting multi-view
+diffusion model can achieve both the generalizability of 2D diffusion and the
+consistency of 3D data. Such a model can thus be applied as a multi-view prior
+for 3D generation via Score Distillation Sampling, where it greatly improves
+the stability of existing 2D-lifting methods by solving the 3D consistency
+problem. Finally, we show that the multi-view diffusion model can also be
+fine-tuned in a few-shot setting for personalized 3D generation, i.e., the
+DreamBooth3D application, where consistency can be maintained after learning
+the subject identity.
+
+ comment: Our project page is https://MV-Dream.github.io +
+
+
+
+
+ + ☆ Robust GAN inversion + + +
+ Recent advancements in real image editing have been attributed to the
+exploration of Generative Adversarial Networks (GANs) latent space. However,
+the main challenge of this procedure is GAN inversion, which aims to map the
+image to the latent space accurately. Existing methods that work on the
+extended latent space $W+$ are unable to achieve low distortion and high
+editability simultaneously. To address this issue, we propose an approach
+which works in the native latent space $W$ and tunes the generator network to
+restore missing image details. We introduce a novel regularization strategy
+with learnable coefficients obtained by training a randomized StyleGAN 2
+model, WRanGAN. This method outperforms traditional approaches in terms of
+reconstruction quality and computational efficiency, achieving the lowest
+distortion with 4 times fewer parameters. Furthermore, we observe a slight
+improvement in the quality of constructing hyperplanes corresponding to binary
+image attributes. We demonstrate the effectiveness of our approach on two
+complex datasets: Flickr-Faces-HQ and LSUN Church.
+
+ comment: 22 pages, 28 figures +
+
+
+
+
+ + ☆ Latent Painter + + +
+ Latent diffusers have revolutionized generative AI and inspired creative art.
+When denoising the latent, the predicted original image at each step
+collectively animates the formation. However, the animation is limited by the
+denoising nature of the diffuser and only renders a sharpening process. This
+work presents Latent Painter, which uses the latent as the canvas and the
+diffuser predictions as the plan to generate painting animation. Latent
+Painter can also transition one generated image into another, including
+between images from two different sets of checkpoints.
+
+
+
+
+ + ☆ Illumination Distillation Framework for Nighttime Person + Re-Identification and A New Benchmark + + +
+ Nighttime person Re-ID (person re-identification in the nighttime) is a very
+important and challenging task for visual surveillance, but it has not been
+thoroughly investigated. Under the low illumination condition, the performance
+of person Re-ID methods usually sharply deteriorates. To address the low
+illumination challenge in nighttime person Re-ID, this paper proposes an
+Illumination Distillation Framework (IDF), which utilizes illumination
+enhancement and illumination distillation schemes to promote the learning of
+Re-ID models. Specifically, IDF consists of a master branch, an illumination
+enhancement branch, and an illumination distillation module. The master branch
+is used to extract the features from a nighttime image. The illumination
+enhancement branch first estimates an enhanced image from the nighttime image
+using a nonlinear curve mapping method and then extracts the enhanced
+features. However, nighttime and enhanced features usually contain data noise
+due to unstable lighting conditions and enhancement failures. To fully exploit
+the complementary benefits of nighttime and enhanced features while
+suppressing data noise, we propose an illumination distillation module. In
+particular, the illumination distillation module fuses the features from two
+branches through a bottleneck fusion model and then uses the fused features to
+guide the learning of both branches in a distillation manner. In addition, we
+build a real-world nighttime person Re-ID dataset, named Night600, which
+contains 600 identities captured from different viewpoints and nighttime
+illumination conditions under complex outdoor environments. Experimental
+results demonstrate that our IDF can achieve state-of-the-art performance on
+two nighttime person Re-ID datasets (i.e., Night600 and Knight). We will
+release our code and dataset at https://github.com/Alexadlu/IDF.
+
+ comment: Accepted by TMM +
+
+
+
+
+ + ☆ Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning + + +
+ Affordable 3D scanners often produce sparse and non-uniform point clouds that
+negatively impact downstream applications in robotic systems. While existing
+point cloud upsampling architectures have demonstrated promising results on
+standard benchmarks, they tend to experience significant performance drops
+when the test data have different distributions from the training data. To
+address this issue, this paper proposes a test-time adaptation approach to
+enhance the model generality of point cloud upsampling. The proposed approach
+leverages meta-learning to explicitly learn network parameters for test-time
+adaptation. Our method does not require any prior information about the test
+data. During meta-training, the model parameters are learned from a collection
+of instance-level tasks, each of which consists of a sparse-dense pair of
+point clouds from the training data. During meta-testing, the trained model is
+fine-tuned with a few gradient updates to produce a unique set of network
+parameters for each test instance. The updated model is then used for the
+final prediction. Our framework is generic and can be applied in a
+plug-and-play manner with existing backbone networks in point cloud
+upsampling. Extensive experiments demonstrate that our approach improves the
+performance of state-of-the-art models.
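+ The test-time step reduces to a few instance-specific gradient updates; a
+minimal sketch (assuming PyTorch; the self-supervised objective, step count,
+and learning rate are placeholders, not the paper's settings):
+```python
+import copy
+import torch
+
+def adapt_and_predict(model, test_input, self_supervised_loss, steps=5, lr=1e-4):
+    """Fine-tune a copy of the meta-trained model per test instance, then predict."""
+    adapted = copy.deepcopy(model)  # unique parameters for this instance
+    opt = torch.optim.SGD(adapted.parameters(), lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        self_supervised_loss(adapted, test_input).backward()
+        opt.step()
+    with torch.no_grad():
+        return adapted(test_input)
+```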
+
+
+
+
+ + ☆ Point-TTA: Test-Time Adaptation for Point Cloud Registration Using + Multitask Meta-Auxiliary Learning + + +
+ We present Point-TTA, a novel test-time adaptation framework for point cloud +registration (PCR) that improves the generalization and the performance of +registration models. While learning-based approaches have achieved impressive +progress, generalization to unknown testing environments remains a major +challenge due to the variations in 3D scans. Existing methods typically train a +generic model and the same trained model is applied on each instance during +testing. This could be sub-optimal since it is difficult for the same model to +handle all the variations during testing. In this paper, we propose a test-time +adaptation approach for PCR. Our model can adapt to unseen distributions at +test-time without requiring any prior knowledge of the test data. Concretely, +we design three self-supervised auxiliary tasks that are optimized jointly with +the primary PCR task. Given a test instance, we adapt our model using these +auxiliary tasks and the updated model is used to perform the inference. During +training, our model is trained using a meta-auxiliary learning approach, such +that the adapted model via auxiliary tasks improves the accuracy of the primary +task. Experimental results demonstrate the effectiveness of our approach in +improving generalization of point cloud registration and outperforming other +state-of-the-art approaches. + +
+
+
+
+
+ + ☆ PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction ICCV2023 + + +
+ Vectorized high-definition map online construction has garnered considerable
+attention in the field of autonomous driving research. Most existing
+approaches model changeable map elements using a fixed number of points, or
+predict local maps in a two-stage autoregressive manner, which may miss
+essential details and lead to error accumulation. Towards precise map element
+learning, we propose a simple yet effective architecture named PivotNet, which
+adopts unified pivot-based map representations and is formulated as a direct
+set prediction paradigm. Concretely, we first propose a novel Point-to-Line
+Mask module to encode both the subordinate and geometrical point-line priors
+in the network. Then, a well-designed Pivot Dynamic Matching module is
+proposed to model the topology in dynamic point sequences by introducing the
+concept of sequence matching. Furthermore, to supervise the position and
+topology of the vectorized point predictions, we propose a Dynamic Vectorized
+Sequence loss. Extensive experiments and ablations show that PivotNet is
+remarkably superior to other SOTAs by at least 5.9 mAP. The code will be
+available soon.
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Self-Sampling Meta SAM: Enhancing Few-shot Medical Image Segmentation + with Meta-Learning + + +
+ While the Segment Anything Model (SAM) excels in semantic segmentation for
+general-purpose images, its performance significantly deteriorates when
+applied to medical images, primarily attributable to insufficient
+representation of medical images in its training dataset. Nonetheless,
+gathering comprehensive datasets and training models that are universally
+applicable is particularly challenging due to the long-tail problem common in
+medical images. To address this gap, here we present a Self-Sampling Meta SAM
+(SSM-SAM) framework for few-shot medical image segmentation. Our innovation
+lies in the design of three key modules: 1) an online fast gradient descent
+optimizer, further optimized by a meta-learner, which ensures swift and robust
+adaptation to new tasks; 2) a Self-Sampling module designed to provide
+well-aligned visual prompts for improved attention allocation; and 3) a robust
+attention-based decoder specifically designed for medical few-shot learning to
+capture relationships between different slices. Extensive experiments on a
+popular abdominal CT dataset and an MRI dataset demonstrate that the proposed
+method achieves significant improvements over state-of-the-art methods in
+few-shot segmentation, with average improvements of 10.21% and 1.80% in terms
+of DSC, respectively. In conclusion, we present a novel approach for rapid
+online adaptation in interactive image segmentation, adapting to a new organ
+in just 0.83 minutes. Code is publicly available on GitHub upon acceptance.
+
+
+
+
+ + ☆ Sparkles: Unlocking Chats Across Multiple Images for Multimodal + Instruction-Following Models + + +
+ Large language models exhibit enhanced zero-shot performance on various tasks +when fine-tuned with instruction-following data. Multimodal +instruction-following models extend these capabilities by integrating both text +and images. However, existing models such as MiniGPT-4 face challenges in +maintaining dialogue coherence in scenarios involving multiple images. A +primary reason is the lack of a specialized dataset for this critical +application. To bridge these gaps, we present SparklesChat, a multimodal +instruction-following model for open-ended dialogues across multiple images. To +support the training, we introduce SparklesDialogue, the first +machine-generated dialogue dataset tailored for word-level interleaved +multi-image and text interactions. Furthermore, we construct SparklesEval, a +GPT-assisted benchmark for quantitatively assessing a model's conversational +competence across multiple images and dialogue turns. Our experiments validate +the effectiveness of SparklesChat in understanding and reasoning across +multiple images and dialogue turns. Specifically, SparklesChat outperformed +MiniGPT-4 on established vision-and-language benchmarks, including the BISON +binary image selection task and the NLVR2 visual reasoning task. Moreover, +SparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding +MiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative +evaluations further demonstrate SparklesChat's generality in handling +real-world applications. All resources will be available at +https://github.com/HYPJUDY/Sparkles. + +
+
+
+
+
+ + ☆ Domain Adaptive Synapse Detection with Weak Point Annotations + + +
+ The development of learning-based methods has greatly improved the detection
+of synapses from electron microscopy (EM) images. However, training a model
+for each dataset is time-consuming and requires extensive annotations.
+Additionally, it is difficult to apply a learned model to data from different
+brain regions due to variations in data distributions. In this paper, we
+present AdaSyn, a two-stage segmentation-based framework for domain adaptive
+synapse detection with weak point annotations. In the first stage, we address
+the detection problem by utilizing a segmentation-based pipeline to obtain
+synaptic instance masks. In the second stage, we improve model generalizability
+on target data by regenerating square masks to get high-quality pseudo labels.
+Benefiting from our high-accuracy detection results, we introduce the distance
+nearest principle to match paired pre-synapses and post-synapses. In the
+WASPSYN challenge at ISBI 2023, our method ranked 1st place.
+
+
+
+
+ + ☆ Improving Lens Flare Removal with General Purpose Pipeline and Multiple + Light Sources Recovery ICCV 2023 + + +
+ When taking images against strong light sources, the resulting images often
+contain heterogeneous flare artifacts. These artifacts can significantly
+affect image visual quality and downstream computer vision tasks. While
+collecting real data pairs of flare-corrupted/flare-free images for training
+flare removal models is challenging, current methods utilize the direct-add
+approach to synthesize data. However, these methods do not consider automatic
+exposure and tone mapping in the image signal processing pipeline (ISP),
+leading to the limited generalization capability of deep models trained on
+such data. Besides, existing methods struggle to handle multiple light sources
+due to the different sizes, shapes and illuminance of various light sources.
+In this paper, we propose a solution to improve the performance of lens flare
+removal by revisiting the ISP, remodeling the principle of automatic exposure
+in the synthesis pipeline, and designing a more reliable light source recovery
+strategy. The new pipeline approaches realistic imaging by discriminating the
+local and global illumination through convex combination, avoiding global
+illumination shifting and local over-saturation. Our strategy for recovering
+multiple light sources convexly averages the input and output of the neural
+network based on illuminance levels, thereby avoiding the need for a hard
+threshold in identifying light sources. We also contribute a new flare removal
+testing dataset containing the flare-corrupted images captured by ten types of
+consumer electronics. The dataset facilitates the verification of the
+generalization capability of flare removal methods. Extensive experiments show
+that our solution can effectively improve the performance of lens flare
+removal and push the frontier toward more general situations.
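+ The convex-averaging idea can be pictured as a per-pixel blend weighted by
+illuminance (a sketch under our own assumptions, using standard Rec. 709 luma
+weights; the paper's exact weighting may differ):
+```python
+import torch
+
+def recover_light_sources(inp: torch.Tensor, out: torch.Tensor) -> torch.Tensor:
+    """Blend network output with input by luminance, with no hard threshold."""
+    # inp, out: (B, 3, H, W) in [0, 1]
+    lum = (0.2126 * inp[:, 0] + 0.7152 * inp[:, 1] + 0.0722 * inp[:, 2]).unsqueeze(1)
+    w = lum.clamp(0.0, 1.0)
+    return w * inp + (1.0 - w) * out  # bright source pixels keep the input
+```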
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Adversarial Finetuning with Latent Representation Constraint to Mitigate + Accuracy-Robustness Tradeoff ICCV + + +
+ This paper addresses the tradeoff between standard accuracy on clean examples +and robustness against adversarial examples in deep neural networks (DNNs). +Although adversarial training (AT) improves robustness, it degrades the +standard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we +propose a novel AT method called ARREST, which comprises three components: (i) +adversarial finetuning (AFT), (ii) representation-guided knowledge distillation +(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples +by initializing its parameters with a DNN that is standardly pretrained on +clean examples. RGKD and NR respectively entail a regularization term and an +algorithm to preserve latent representations of clean examples during AFT. RGKD +penalizes the distance between the representations of the standardly pretrained +and AFT DNNs. NR switches input adversarial examples to nonadversarial ones +when the representation changes significantly during AFT. By combining these +components, ARREST achieves both high standard accuracy and robustness. +Experimental results demonstrate that ARREST mitigates the tradeoff more +effectively than previous AT-based methods do. + +
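+ The RGKD term is, in essence, a distance penalty between latent
+representations; a minimal sketch (the squared-error distance and the
+weighting below are assumptions on our part, assuming PyTorch):
+```python
+import torch.nn.functional as F
+
+def rgkd_loss(aft_feats, pretrained_feats):
+    """Penalize drift of the AFT model's representations from the pretrained ones."""
+    return F.mse_loss(aft_feats, pretrained_feats.detach())
+
+# total_loss = adversarial_training_loss + lambda_rgkd * rgkd_loss(f_aft, f_pre)
+```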
+
+ comment: Accepted by International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Njobvu-AI: An open-source tool for collaborative image labeling and + implementation of computer vision models + + +
+ Practitioners interested in using computer vision models lack user-friendly
+and open-source software that combines features to label training data, allow
+multiple users, train new algorithms, review output, and implement new models.
+Labeling training data, such as images, is a key step to developing accurate
+object detection algorithms using computer vision. Many cloud-based services
+for marking or labeling image and video data are impractical for this step
+because of limited internet bandwidth in many regions of the world. Desktop
+tools are useful for groups working in remote locations, but users often do
+not have the capability to combine projects developed locally by multiple
+collaborators. Furthermore, many tools offer features for labeling data or
+using pre-trained models for classification, but few allow researchers to
+combine these steps to create and apply custom models. Free, open-source, and
+user-friendly software that offers a full suite of features (e.g., the ability
+to work locally and online, and to train custom models) is desirable for field
+researchers and conservationists who may have limited coding skills. We
+developed Njobvu-AI, a free, open-source tool that can be run on both desktop
+and server hardware using Node.js, allowing users to label data, combine
+projects for collaboration and review, train custom algorithms, and implement
+new computer vision models. The name Njobvu-AI (pronounced N-joh-voo AI),
+incorporating the Chichewa word for elephant, is inspired by a wildlife
+monitoring program in Malawi that was a primary impetus for the development of
+this tool and references similarities between the powerful memory of elephants
+and properties of computer vision models.
+
+</p>
+
+ comment: 13 pages, 6 figures. For code and documentation, see + https://github.com/sullichrosu/Njobvu-AI/ +
+
+
+
+
+ + ☆ Deformation Robust Text Spotting with Geometric Prior + + +
+ The goal of text spotting is to perform text detection and recognition in an
+end-to-end manner. Although the diversity of luminosity and orientation in
+scene texts has been widely studied, the font diversity and shape variance of
+the same character are ignored in recent works, since most characters in
+natural images are rendered in standard fonts. To solve this problem, we
+present a Chinese Artistic Dataset, termed ARText, which contains 33,000
+artistic images with rich shape deformation and font diversity. Based on this
+database, we develop a deformation robust text spotting method (DR
+TextSpotter) to address the recognition of characters with complex
+deformations across different fonts. Specifically, we propose a geometric
+prior module to highlight the important features based on an unsupervised
+landmark detection sub-network. A graph convolution network is further
+constructed to fuse the character features and landmark features, and then
+perform semantic reasoning to enhance the discrimination between different
+characters. Experiments are conducted on the ARText and IC19-ReCTS datasets.
+Our results demonstrate the effectiveness of our proposed method.
+
+</p>
+
+
+
+
+ + ☆ RGB-T Tracking via Multi-Modal Mutual Prompt Learning + + +
+ Object tracking based on the fusion of visible and thermal images, known as
+RGB-T tracking, has gained increasing attention from researchers in recent
+years. How to achieve a more comprehensive fusion of information from the two
+modalities with fewer computational costs has been a problem that researchers
+have been exploring. Recently, with the rise of prompt learning in computer
+vision, we can better transfer knowledge from visual large models to
+downstream tasks. Considering the strong complementarity between visible and
+thermal modalities, we propose a tracking architecture based on mutual prompt
+learning between the two modalities. We also design a lightweight prompter
+that incorporates attention mechanisms in two dimensions to transfer
+information from one modality to the other with lower computational costs,
+embedding it into each layer of the backbone. Extensive experiments have
+demonstrated that our proposed tracking architecture is effective and
+efficient, achieving state-of-the-art performance while maintaining high
+running speeds.
+
+</p>
+
+ comment: 9 pages, 5 figures, 5 tables +
+
+
+
+
+ + ☆ Separate and Locate: Rethink the Text in Text-based Visual Question + Answering ACM MM 2023 + + +
+ Text-based Visual Question Answering (TextVQA) aims at answering questions
+about the text in images. Most works in this field focus on designing network
+structures or pre-training tasks. All these methods list the OCR texts in
+reading order (from left to right and top to bottom) to form a sequence, which
+is treated as a natural language ``sentence''. However, they ignore the fact
+that most OCR words in the TextVQA task do not have a semantic contextual
+relationship. In addition, these approaches use 1-D position embedding to
+construct the spatial relation between OCR tokens sequentially, which is not
+reasonable. The 1-D position embedding can only represent the left-right
+sequence relationship between words in a sentence, not the complex spatial
+position relationship. To tackle these problems, we propose a novel method
+named Separate and Locate (SaL) that explores text contextual cues and designs
+spatial position embedding to construct spatial relations between OCR texts.
+Specifically, we propose a Text Semantic Separate (TSS) module that helps the
+model recognize whether words have semantic contextual relations. Then, we
+introduce a Spatial Circle Position (SCP) module that helps the model better
+construct and reason about the spatial position relationships between OCR
+texts. Our SaL model outperforms the baseline model by 4.44% and 3.96%
+accuracy on the TextVQA and ST-VQA datasets. Compared with the
+state-of-the-art method pre-trained on 64 million samples, our method, without
+any pre-training tasks, still achieves 2.68% and 2.52% accuracy improvements
+on TextVQA and ST-VQA. Our code and models will be released at
+https://github.com/fangbufang/SaL.
+
+</p>
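+ One plausible reading of the SCP idea is to encode each pair of OCR tokens
+by a distance ring and an angle sector around the reference token, instead of
+a 1-D order. The sketch below is an illustrative stand-in, not the paper's
+exact formulation:
+
+import numpy as np
+
+def circle_position_ids(centers, r_bins=4, a_bins=8, r_max=1.0):
+    """Bucket pairwise OCR-token geometry into discrete relation ids.
+
+    centers: (N, 2) normalized box centers. Returns an (N, N) id matrix
+    that a learned embedding table could consume.
+    """
+    d = centers[None, :, :] - centers[:, None, :]      # (N, N, 2) offsets
+    radius = np.linalg.norm(d, axis=-1)
+    angle = np.arctan2(d[..., 1], d[..., 0])           # in [-pi, pi]
+    r_id = np.clip((radius / r_max * r_bins).astype(int), 0, r_bins - 1)
+    a_id = ((angle + np.pi) / (2 * np.pi) * a_bins).astype(int) % a_bins
+    return r_id * a_bins + a_id                        # relation id per pair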
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ 3D vision-based structural masonry damage detection + + +
+ The detection of masonry damage is essential for preventing potentially
+disastrous outcomes. Manual inspection can, however, take a long time and be
+hazardous to human inspectors. Automation of the inspection process using
+novel computer vision and machine learning algorithms can be a more efficient
+and safe solution to prevent further deterioration of masonry structures. Most
+existing 2D vision-based methods are limited to qualitative damage
+classification, 2D localization, and in-plane quantification. In this study,
+we present a 3D vision-based methodology for accurate masonry damage
+detection, which offers a more robust solution with a greater field of view,
+depth of vision, and the ability to detect failures in complex environments.
+First, images of the masonry specimens are collected to generate a 3D point
+cloud. Second, 3D point cloud processing methods are developed to evaluate the
+masonry damage. We demonstrate the effectiveness of our approach through
+experiments on structural masonry components. Our experiments showed that the
+proposed system can effectively classify damage states and localize and
+quantify critical damage features. The results show that the proposed method
+can raise the level of autonomy in the inspection of masonry structures.
+
+</p>
+
+ comment: 10 pages, accepted in the Canadian Conference - Pacific Conference on + Earthquake Engineering 2023, Vancouver, British Columbia +
+
+
+
+
+ + ☆ Improving Multiple Sclerosis Lesion Segmentation Across Clinical Sites: + A Federated Learning Approach with Noise-Resilient Training + + +
+ Accurately measuring the evolution of Multiple Sclerosis (MS) with magnetic +resonance imaging (MRI) critically informs understanding of disease progression +and helps to direct therapeutic strategy. Deep learning models have shown +promise for automatically segmenting MS lesions, but the scarcity of accurately +annotated data hinders progress in this area. Obtaining sufficient data from a +single clinical site is challenging and does not address the heterogeneous need +for model robustness. Conversely, the collection of data from multiple sites +introduces data privacy concerns and potential label noise due to varying +annotation standards. To address this dilemma, we explore the use of the +federated learning framework while considering label noise. Our approach +enables collaboration among multiple clinical sites without compromising data +privacy under a federated learning paradigm that incorporates a noise-robust +training strategy based on label correction. Specifically, we introduce a +Decoupled Hard Label Correction (DHLC) strategy that considers the imbalanced +distribution and fuzzy boundaries of MS lesions, enabling the correction of +false annotations based on prediction confidence. We also introduce a Centrally +Enhanced Label Correction (CELC) strategy, which leverages the aggregated +central model as a correction teacher for all sites, enhancing the reliability +of the correction process. Extensive experiments conducted on two multi-site +datasets demonstrate the effectiveness and robustness of our proposed methods, +indicating their potential for clinical applications in multi-site +collaborations. + +
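+ The confidence-based correction in DHLC can be sketched as follows: voxels
+where the model confidently contradicts the annotation are flipped, with
+decoupled thresholds reflecting the lesion/background imbalance. Threshold
+values and tensor layout are illustrative assumptions:
+
+import torch
+
+def decoupled_hard_label_correction(prob, label, t_fg=0.9, t_bg=0.9):
+    """Correct binary lesion labels where predictions are confident.
+
+    prob: predicted lesion probability map; label: current 0/1 annotation.
+    """
+    to_fg = (prob > t_fg) & (label == 0)        # confident missed lesion
+    to_bg = (prob < 1.0 - t_bg) & (label == 1)  # confident false annotation
+    corrected = label.clone()
+    corrected[to_fg] = 1
+    corrected[to_bg] = 0
+    return corrected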
+
+ comment: 11 pages, 4 figures, journal submission +
+
+
+
+
+ + ♻ ☆ Motion Matters: Neural Motion Transfer for Better Camera Physiological + Measurement + + +
+ Machine learning models for camera-based physiological measurement can have +weak generalization due to a lack of representative training data. Body motion +is one of the most significant sources of noise when attempting to recover the +subtle cardiac pulse from a video. We explore motion transfer as a form of data +augmentation to introduce motion variation while preserving physiological +changes of interest. We adapt a neural video synthesis approach to augment +videos for the task of remote photoplethysmography (rPPG) and study the effects +of motion augmentation with respect to 1) the magnitude and 2) the type of +motion. After training on motion-augmented versions of publicly available +datasets, we demonstrate a 47% improvement over existing inter-dataset results +using various state-of-the-art methods on the PURE dataset. We also present +inter-dataset results on five benchmark datasets to show improvements of up to +79% using TS-CAN, a neural rPPG estimation method. Our findings illustrate the +usefulness of motion transfer as a data augmentation technique for improving +the generalization of models for camera-based physiological sensing. We release +our code for using motion transfer as a data augmentation technique on three +publicly available datasets, UBFC-rPPG, PURE, and SCAMPS, and models +pre-trained on motion-augmented data here: https://motion-matters.github.io/ + +
+
+ comment: 17 pages, 6 figures, 15 tables +
+
+
+
+
+ + ♻ ☆ StyleGAN as a Utility-Preserving Face De-identification Method + + +
+ Face de-identification methods have been proposed to preserve users' privacy +by obscuring their faces. These methods, however, can degrade the quality of +photos, and they usually do not preserve the utility of faces, i.e., their age, +gender, pose, and facial expression. Recently, GANs, such as StyleGAN, have +been proposed, which generate realistic, high-quality imaginary faces. In this +paper, we investigate the use of StyleGAN in generating de-identified faces +through style mixing. We examined this de-identification method for preserving +utility and privacy by implementing several face detection, verification, and +identification attacks and conducting a user study. The results from our +extensive experiments, human evaluation, and comparison with two +state-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN +performs on par or better than these methods, preserving users' privacy and +images' utility. In particular, the results of the machine learning-based +experiments show that StyleGAN0-4 preserves utility better than CIAGAN and +DeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves +utility at the same level while providing more privacy. In this paper, for the +first time, we also performed a carefully designed user study to examine both +privacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well +as CIAGAN and DeepPrivacy from the human observers' perspectives. Our +statistical tests showed that participants tend to verify and identify +StyleGAN0-5 images more easily than DeepPrivacy images. All the methods but +StyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding +utility, as expected, StyleGAN0-5 performed significantly better in preserving +some attributes. Among all methods, on average, participants believe gender has +been preserved the most while naturalness has been preserved the least. + +
+
+
+
+
+ + ♻ ☆ Humans in 4D: Reconstructing and Tracking Humans with Transformers ICCV 2023 + + +
+ We present an approach to reconstruct humans and track them over time. At the +core of our approach, we propose a fully "transformerized" version of a network +for human mesh recovery. This network, HMR 2.0, advances the state of the art +and shows the capability to analyze unusual poses that have in the past been +difficult to reconstruct from single images. To analyze video, we use 3D +reconstructions from HMR 2.0 as input to a tracking system that operates in 3D. +This enables us to deal with multiple people and maintain identities through +occlusion events. Our complete approach, 4DHumans, achieves state-of-the-art +results for tracking people from monocular video. Furthermore, we demonstrate +the effectiveness of HMR 2.0 on the downstream task of action recognition, +achieving significant improvements over previous pose-based action recognition +approaches. Our code and models are available on the project website: +https://shubham-goel.github.io/4dhumans/. + +
+
+ comment: In ICCV 2023. Project Webpage: + https://shubham-goel.github.io/4dhumans/ +
+
+
+
+
+ + ♻ ☆ Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave + Communications + + +
+ This study demonstrates the feasibility of point cloud-based proactive link +quality prediction for millimeter-wave (mmWave) communications. Previous +studies have proposed machine learning-based methods to predict received signal +strength for future time periods using time series of depth images to mitigate +the line-of-sight (LOS) path blockage by pedestrians in mmWave communication. +However, these image-based methods have limited applicability due to privacy +concerns as camera images may contain sensitive information. This study +proposes a point cloud-based method for mmWave link quality prediction and +demonstrates its feasibility through experiments. Point clouds represent +three-dimensional (3D) spaces as a set of points and are sparser and less +likely to contain sensitive information than camera images. Additionally, point +clouds provide 3D position and motion information, which is necessary for +understanding the radio propagation environment involving pedestrians. This +study designs the mmWave link quality prediction method and conducts realistic +indoor experiments, where the link quality fluctuates significantly due to +human blockage, using commercially available IEEE 802.11ad-based 60 GHz +wireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light +detection and ranging (LiDAR) for point cloud acquisition. The experimental +results showed that our proposed method can predict future large attenuation of +mmWave received signal strength and throughput induced by the LOS path blockage +by pedestrians with comparable or superior accuracy to image-based prediction +methods. Hence, our point cloud-based method can serve as a viable alternative +to image-based methods. + +
+
+ comment: Submitted to IEEE Transactions on Machine Learning in Communications + and Networking +
+
+
+
+
+ + ♻ ☆ RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation + + +
+ For robots to be useful outside labs and specialized factories we need a way +to teach them new useful behaviors quickly. Current approaches lack either the +generality to onboard new tasks without task-specific engineering, or else lack +the data-efficiency to do so in an amount of time that enables practical use. +In this work we explore dense tracking as a representational vehicle to allow +faster and more general learning from demonstration. Our approach utilizes +Track-Any-Point (TAP) models to isolate the relevant motion in a demonstration, +and parameterize a low-level controller to reproduce this motion across changes +in the scene configuration. We show this results in robust robot policies that +can solve complex object-arrangement tasks such as shape-matching, stacking, +and even full path-following tasks such as applying glue and sticking objects +together, all from demonstrations that can be collected in minutes. + +
+
+ comment: Project website: https://robotap.github.io +
+
+
+
+
+ + ♻ ☆ 6D Object Pose Estimation from Approximate 3D Models for Orbital + Robotics IROS + + +
+ We present a novel technique to estimate the 6D pose of objects from single +images where the 3D geometry of the object is only given approximately and not +as a precise 3D model. To achieve this, we employ a dense 2D-to-3D +correspondence predictor that regresses 3D model coordinates for every pixel. +In addition to the 3D coordinates, our model also estimates the pixel-wise +coordinate error to discard correspondences that are likely wrong. This allows +us to generate multiple 6D pose hypotheses of the object, which we then refine +iteratively using a highly efficient region-based approach. We also introduce a +novel pixel-wise posterior formulation by which we can estimate the probability +for each hypothesis and select the most likely one. As we show in experiments, +our approach is capable of dealing with extreme visual conditions including +overexposure, high contrast, or low signal-to-noise ratio. This makes it a +powerful technique for the particularly challenging task of estimating the pose +of tumbling satellites for in-orbit robotic applications. Our method achieves +state-of-the-art performance on the SPEED+ dataset and has won the SPEC2021 +post-mortem competition. + +
+
+ comment: Proceedings of IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS) +
+
+
+
+
+ + ♻ ☆ DUFormer: Solving Power Line Detection Task in Aerial Images using + Semantic Segmentation + + +
+ Unmanned aerial vehicles (UAVs) are frequently used for inspecting power
+lines and capturing high-resolution aerial images. However, detecting power
+lines in aerial images is difficult, as the foreground data (i.e., power
+lines) is small and the background information is abundant. To tackle this
+problem, we introduce DUFormer, a semantic segmentation algorithm explicitly
+designed to detect power lines in aerial images. We presuppose that it is
+advantageous to train an efficient Transformer model with sufficient feature
+extraction using a convolutional neural network (CNN) with a strong inductive
+bias. With this goal in mind, we introduce a heavy token encoder that performs
+overlapping feature remodeling and tokenization. The encoder comprises a
+pyramid CNN feature extraction module and a power line feature enhancement
+module. After successful local feature extraction for power lines, feature
+fusion is conducted. Then, the Transformer block is used for global modeling.
+The final segmentation result is achieved by amalgamating local and global
+features in the decode head. Moreover, we demonstrate the importance of the
+joint multi-weight loss function in power line segmentation. Our experimental
+results show that our proposed method outperforms all state-of-the-art methods
+in power line segmentation on the publicly accessible TTPLA dataset.
+
+</p>
+
+
+
+
+ + ♻ ☆ RBSR: Efficient and Flexible Recurrent Network for Burst + Super-Resolution + + +
+ Burst super-resolution (BurstSR) aims at reconstructing a high-resolution +(HR) image from a sequence of low-resolution (LR) and noisy images, which is +conducive to enhancing the imaging effects of smartphones with limited sensors. +The main challenge of BurstSR is to effectively combine the complementary +information from input frames, while existing methods still struggle with it. +In this paper, we suggest fusing cues frame-by-frame with an efficient and +flexible recurrent network. In particular, we emphasize the role of the +base-frame and utilize it as a key prompt to guide the knowledge acquisition +from other frames in every recurrence. Moreover, we introduce an implicit +weighting loss to improve the model's flexibility in facing input frames with +variable numbers. Extensive experiments on both synthetic and real-world +datasets demonstrate that our method achieves better results than +state-of-the-art ones. Codes and pre-trained models are available at +https://github.com/ZcsrenlongZ/RBSR. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ IML-ViT: Benchmarking Image Manipulation Localization by Vision + Transformer + + +
+ Advanced image tampering techniques are increasingly challenging the
+trustworthiness of multimedia, leading to the development of Image
+Manipulation Localization (IML). But what makes a good IML model? The answer
+lies in the way to capture artifacts. Exploiting artifacts requires the model
+to extract non-semantic discrepancies between manipulated and authentic
+regions, necessitating explicit comparisons between the two areas. With the
+self-attention mechanism, naturally, the Transformer should be a better
+candidate to capture artifacts. However, due to limited datasets, there is
+currently no pure ViT-based approach for IML to serve as a benchmark, and CNNs
+dominate the entire task. Nevertheless, CNNs suffer from weak long-range and
+non-semantic modeling. To bridge this gap, based on the fact that artifacts
+are sensitive to image resolution, amplified under multi-scale features, and
+massive at the manipulation border, we formulate the answer to this question
+as building a ViT with high-resolution capacity, multi-scale feature
+extraction capability, and manipulation edge supervision that could converge
+with a small amount of data. We term this simple but effective ViT paradigm
+IML-ViT, which has significant potential to become a new benchmark for IML.
+Extensive experiments on five benchmark datasets verified that our model
+outperforms the state-of-the-art manipulation localization methods. Code and
+models are available at https://github.com/SunnyHaze/IML-ViT.
+
+</p>
+
+
+
+
+ + ♻ ☆ Transformer-based interpretable multi-modal data fusion for skin lesion + classification + + +
+ A lot of deep learning (DL) research these days is mainly focused on +improving quantitative metrics regardless of other factors. In human-centered +applications, like skin lesion classification in dermatology, DL-driven +clinical decision support systems are still in their infancy due to the limited +transparency of their decision-making process. Moreover, the lack of procedures +that can explain the behavior of trained DL algorithms leads to almost no trust +from clinical physicians. To diagnose skin lesions, dermatologists rely on +visual assessment of the disease and the data gathered from the patient's +anamnesis. Data-driven algorithms dealing with multi-modal data are limited by +the separation of feature-level and decision-level fusion procedures required +by convolutional architectures. To address this issue, we enable single-stage +multi-modal data fusion via the attention mechanism of transformer-based +architectures to aid in diagnosing skin diseases. Our method beats other +state-of-the-art single- and multi-modal DL architectures in image-rich and +patient-data-rich environments. Additionally, the choice of the architecture +enables native interpretability support for the classification task both in the +image and metadata domain with no additional modifications necessary. + +
+
+ comment: Submitted to IEEE JBHI in July 2023 +
+
+
+
+
+ + ♻ ☆ USAGE: A Unified Seed Area Generation Paradigm for Weakly Supervised + Semantic Segmentation ICCV 2023 + + +
+ Seed area generation is usually the starting point of weakly supervised +semantic segmentation (WSSS). Computing the Class Activation Map (CAM) from a +multi-label classification network is the de facto paradigm for seed area +generation, but CAMs generated from Convolutional Neural Networks (CNNs) and +Transformers are prone to be under- and over-activated, respectively, which +makes the strategies to refine CAMs for CNNs usually inappropriate for +Transformers, and vice versa. In this paper, we propose a Unified optimization +paradigm for Seed Area GEneration (USAGE) for both types of networks, in which +the objective function to be optimized consists of two terms: One is a +generation loss, which controls the shape of seed areas by a temperature +parameter following a deterministic principle for different types of networks; +The other is a regularization loss, which ensures the consistency between the +seed areas that are generated by self-adaptive network adjustment from +different views, to overturn false activation in seed areas. Experimental +results show that USAGE consistently improves seed area generation for both +CNNs and Transformers by large margins, e.g., outperforming state-of-the-art +methods by a mIoU of 4.1% on PASCAL VOC. Moreover, based on the USAGE-generated +seed areas on Transformers, we achieve state-of-the-art WSSS results on both +PASCAL VOC and MS COCO. + +
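+ The two-term objective can be sketched as below: a temperature-shaped
+generation loss plus a consistency regularizer between seed areas from two
+network views. The concrete loss forms, the sigmoid shaping, and the weight
+`beta` are illustrative assumptions; the paper specifies different instances
+per network type:
+
+import torch
+import torch.nn.functional as F
+
+def usage_loss(cam, cam_adjusted, target, temperature=0.4, beta=1.0):
+    """Generation loss + consistency regularization over seed areas.
+
+    cam: (B, C, H, W) class activation map; cam_adjusted: map from the
+    self-adaptively adjusted view; target: (B, C) multi-hot image labels.
+    """
+    seed = torch.sigmoid(cam / temperature)            # temperature-shaped seeds
+    gen_loss = F.binary_cross_entropy(seed.mean(dim=(2, 3)), target)
+    reg_loss = F.mse_loss(seed, torch.sigmoid(cam_adjusted / temperature))
+    return gen_loss + beta * reg_loss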
+
+ comment: ICCV 2023 camera-ready version +
+
+
+
+
+ + ♻ ☆ Leveraging Image-based Generative Adversarial Networks for Time Series + Generation + + +
+ Generative models for images have gained significant attention in computer +vision and natural language processing due to their ability to generate +realistic samples from complex data distributions. To leverage the advances of +image-based generative models for the time series domain, we propose a +two-dimensional image representation for time series, the Extended +Intertemporal Return Plot (XIRP). Our approach captures the intertemporal time +series dynamics in a scale-invariant and invertible way, reducing training time +and improving sample quality. We benchmark synthetic XIRPs obtained by an +off-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) to other image +representations and models regarding similarity and predictive ability metrics. +Our novel, validated image representation for time series consistently and +significantly outperforms a state-of-the-art RNN-based generative model +regarding predictive ability. Further, we introduce an improved stochastic +inversion to substantially improve simulation quality regardless of the +representation and provide the prospect of transfer potentials in other +domains. + +
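+ The abstract does not spell out the XIRP construction, but a plausible
+pairwise-return reading already exhibits the claimed scale invariance and
+invertibility. The sketch below is a guess at the spirit of the encoding, not
+the paper's definition:
+
+import numpy as np
+
+def intertemporal_return_plot(x):
+    """Map a positive-valued 1-D series to a 2-D pairwise log-return image.
+
+    Entry (i, j) holds the log return from step i to step j. Scaling the
+    series by a constant leaves the image unchanged (scale invariance), and
+    row 0 recovers the series up to its first value (invertibility).
+    """
+    logx = np.log(np.asarray(x, dtype=float))
+    return logx[None, :] - logx[:, None]      # (T, T) intertemporal returns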
+
+
+
+
+ + ♻ ☆ Towards Realistic Out-of-Distribution Detection: A Novel Evaluation + Framework for Improving Generalization in OOD Detection + + +
+ This paper presents a novel evaluation framework for Out-of-Distribution +(OOD) detection that aims to assess the performance of machine learning models +in more realistic settings. We observed that the real-world requirements for +testing OOD detection methods are not satisfied by the current testing +protocols. They usually encourage methods to have a strong bias towards a low +level of diversity in normal data. To address this limitation, we propose new +OOD test datasets (CIFAR-10-R, CIFAR-100-R, and ImageNet-30-R) that can allow +researchers to benchmark OOD detection performance under realistic distribution +shifts. Additionally, we introduce a Generalizability Score (GS) to measure the +generalization ability of a model during OOD detection. Our experiments +demonstrate that improving the performance on existing benchmark datasets does +not necessarily improve the usability of OOD detection models in real-world +scenarios. While leveraging deep pre-trained features has been identified as a +promising avenue for OOD detection research, our experiments show that +state-of-the-art pre-trained models tested on our proposed datasets suffer a +significant drop in performance. To address this issue, we propose a +post-processing stage for adapting pre-trained features under these +distribution shifts before calculating the OOD scores, which significantly +enhances the performance of state-of-the-art pre-trained models on our +benchmarks. + +
+
+
+
+
+ + ♻ ☆ LAC: Latent Action Composition for Skeleton-based Action Segmentation ICCV 2023 + + +
+ Skeleton-based action segmentation requires recognizing composable actions
+in untrimmed videos. Current approaches decouple this problem by first
+extracting local visual features from skeleton sequences and then processing
+them by a temporal model to classify frame-wise actions. However, their
+performances remain limited as the visual features cannot sufficiently express
+composable actions. In this context, we propose Latent Action Composition
+(LAC), a novel self-supervised framework aiming at learning from synthesized
+composable motions for skeleton-based action segmentation. LAC is composed of
+a novel generation module for synthesizing new sequences. Specifically, we
+design a linear latent space in the generator to represent primitive motion.
+New composed motions can be synthesized by simply performing arithmetic
+operations on latent representations of multiple input skeleton sequences. LAC
+leverages such synthesized sequences, which have large diversity and
+complexity, for learning visual representations of skeletons in both sequence
+and frame spaces via contrastive learning. The resulting visual encoder has a
+high expressive power and can be effectively transferred onto action
+segmentation tasks by end-to-end fine-tuning without the need for additional
+temporal models. We conduct a study focusing on transfer learning and show
+that representations learned from pre-trained LAC outperform the
+state-of-the-art by a large margin on the TSU, Charades, and PKU-MMD datasets.
+
+</p>
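+ The latent arithmetic at the heart of LAC reduces to a few lines once the
+generator is split into an encoder and a decoder with a linear latent space.
+Interface names and the simple two-sequence blend are illustrative:
+
+import torch
+
+def compose_motions(encoder, decoder, seq_a, seq_b, w=0.5):
+    """Synthesize a composed skeleton sequence by latent-space arithmetic."""
+    # Weighted sum in the linear latent space yields a new composable motion
+    z = w * encoder(seq_a) + (1.0 - w) * encoder(seq_b)
+    return decoder(z)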
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Unsupervised Hashing with Similarity Distribution Calibration BMVC 2023 + + +
+ Unsupervised hashing methods typically aim to preserve the similarity
+between data points in a feature space by mapping them to binary hash codes.
+However, these methods often overlook the fact that the similarity between
+data points in the continuous feature space may not be preserved in the
+discrete hash code space, due to the limited similarity range of hash codes.
+The similarity range is bounded by the code length and can lead to a problem
+known as similarity collapse. That is, the positive and negative pairs of data
+points become less distinguishable from each other in the hash space. To
+alleviate this problem, in this paper a novel Similarity Distribution
+Calibration (SDC) method is introduced. SDC aligns the hash code similarity
+distribution towards a calibration distribution (e.g., beta distribution) with
+sufficient spread across the entire similarity range, thus alleviating the
+similarity collapse problem. Extensive experiments show that our SDC
+significantly outperforms state-of-the-art alternatives on coarse
+category-level and instance-level image retrieval. Code is available at
+https://github.com/kamwoh/sdc.
+
+</p>
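+ To see why the range is limited: the inner-product similarity of two K-bit
+codes can take only K + 1 distinct values, so many pairs collapse onto the
+same similarity. A simplified quantile-matching stand-in for the calibration
+step (the paper optimizes an alignment objective during training instead) is
+sketched below:
+
+import numpy as np
+from scipy import stats
+
+def calibrate_similarities(sims, a=2.0, b=2.0):
+    """Spread an empirical similarity distribution onto a Beta target.
+
+    sims: 1-D array of pairwise code similarities in [-1, 1]. Quantiles of
+    Beta(a, b), rescaled to [-1, 1], serve as the calibration distribution.
+    """
+    ranks = stats.rankdata(sims) / (len(sims) + 1)    # empirical CDF values
+    return 2.0 * stats.beta.ppf(ranks, a, b) - 1.0    # calibrated similarities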
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Enhancing the accuracies by performing pooling decisions adjacent to the + output layer + + +
+ Learning classification tasks on (2^n x 2^n) inputs typically involves at
+most n (2x2) max-pooling (MP) operators along the entire feedforward deep
+architecture. Here we show, using the CIFAR-10 database, that pooling
+decisions adjacent to the last convolutional layer significantly enhance
+accuracies. In particular, average accuracies of the advanced-VGG with m
+layers (A-VGGm) architectures are 0.936, 0.940, 0.954, 0.955, and 0.955 for
+m=6, 8, 14, 13, and 16, respectively. The results indicate that A-VGG8's
+accuracy is superior to VGG16's, and that the accuracies of A-VGG13 and
+A-VGG16 are equal and comparable to that of Wide-ResNet16. In addition,
+replacing the three fully connected (FC) layers with a single FC layer (as in
+A-VGG6 and A-VGG14), or with several linear-activation FC layers, yielded
+similar accuracies. These significantly enhanced accuracies stem from training
+the most influential input-output routes, in comparison to the inferior routes
+selected following multiple MP decisions along the deep architecture. In
+addition, accuracies are sensitive to the order of the non-commutative MP and
+average pooling operators adjacent to the output layer, which varies the
+number and location of training routes. The results call for the reexamination
+of previously proposed deep architectures and their accuracies by utilizing
+the proposed pooling strategy adjacent to the output layer.
+
+</p>
+
+ comment: 29 pages, 3 figures, 1 table, and Supplementary Information +
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision
+transformer (CNN-Transformer) for medical image segmentation. The proposed
+Hybrid Decoder, based on the MaxViT block, is designed to harness the power of
+both the convolution and self-attention mechanisms at each decoding stage with
+a nominal memory and computational burden. The inclusion of multi-axis
+self-attention within each decoder stage significantly enhances the
+discriminating capacity between the object and background regions, thereby
+helping to improve segmentation efficiency. In the Hybrid Decoder block, the
+fusion process commences by integrating the upsampled lower-level decoder
+features, obtained through transpose convolution, with the skip-connection
+features derived from the hybrid encoder. Subsequently, the fused features are
+refined through a multi-axis attention mechanism. The proposed decoder block
+is repeated multiple times to progressively segment the nuclei regions.
+Experimental results on the MoNuSeg18 and MoNuSAC20 datasets demonstrate the
+effectiveness of the proposed technique. Our MaxViT-UNet outperformed the
+previous CNN-based (UNet) and Transformer-based (Swin-UNet) techniques by a
+considerable margin on both standard datasets. The implementation and trained
+weights are available at https://github.com/PRLAB21/MaxViT-UNet.
+
+</p>
+
+ comment: 17 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised + Real-world Single Image Super-Resolution + + +
+ Single image super-resolution (SISR) is a challenging ill-posed problem that +aims to up-sample a given low-resolution (LR) image to a high-resolution (HR) +counterpart. Due to the difficulty in obtaining real LR-HR training pairs, +recent approaches are trained on simulated LR images degraded by simplified +down-sampling operators, e.g., bicubic. Such an approach can be problematic in +practice because of the large gap between the synthesized and real-world LR +images. To alleviate the issue, we propose a novel Invertible scale-Conditional +Function (ICF), which can scale an input image and then restore the original +input with different scale conditions. By leveraging the proposed ICF, we +construct a novel self-supervised SISR framework (ICF-SRSR) to handle the +real-world SR task without using any paired/unpaired training data. +Furthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs, +which can make existing supervised SISR networks more robust. Extensive +experiments demonstrate the effectiveness of the proposed method in handling +SISR in a fully self-supervised manner. Our ICF-SRSR demonstrates superior +performance compared to the existing methods trained on synthetic paired images +in real-world scenarios and exhibits comparable performance compared to +state-of-the-art supervised/unsupervised methods on public benchmark datasets. + +
+
+
+
+
+ + ♻ ☆ RemovalNet: DNN Fingerprint Removal Attacks + + +
+ With the performance of deep neural networks (DNNs) remarkably improving,
+DNNs have been widely used in many areas. Consequently, the DNN model has
+become a valuable asset, and its intellectual property is safeguarded by
+ownership verification techniques (e.g., DNN fingerprinting). However, the
+feasibility of the DNN fingerprint removal attack and its potential influence
+remain an open problem. In this paper, we perform the first comprehensive
+investigation of DNN fingerprint removal attacks. Generally, the knowledge
+contained in a DNN model can be categorized into general semantic and
+fingerprint-specific knowledge. To this end, we propose a min-max bilevel
+optimization-based DNN fingerprint removal attack named RemovalNet, to evade
+model ownership verification. The lower-level optimization is designed to
+remove fingerprint-specific knowledge, while the upper-level optimization
+distills the victim model's general semantic knowledge to maintain the
+surrogate model's performance. We conduct extensive experiments to evaluate
+the fidelity, effectiveness, and efficiency of RemovalNet against four
+advanced defense methods on six metrics. The empirical results demonstrate
+that (1) RemovalNet is effective: after our DNN fingerprint removal attack,
+the model distance between the target and surrogate models is 100x higher
+than that of the baseline attacks; (2) RemovalNet is efficient: it uses only
+0.2% (400 samples) of the substitute dataset and 1,000 iterations to conduct
+our attack, and compared with advanced model stealing attacks, it saves up to
+nearly 85% of computational resources; (3) RemovalNet achieves high fidelity:
+the created surrogate model maintains high accuracy after the DNN fingerprint
+removal process. Our code is available at:
+https://github.com/grasses/RemovalNet.
+
+</p>
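+ A hedged sketch of one min-max iteration follows: the lower level pushes
+fingerprint-bearing features away from the victim's, the upper level distills
+the victim's outputs to retain general semantics. The feature hook, optimizer
+split, and weighting are illustrative assumptions:
+
+import torch
+import torch.nn.functional as F
+
+def removal_step(surrogate, victim, x, opt_lower, opt_upper, lam=1.0):
+    """One bilevel fingerprint-removal iteration on a batch x."""
+    # Lower level: remove fingerprint-specific knowledge
+    opt_lower.zero_grad()
+    f_loss = -torch.norm(surrogate.features(x) - victim.features(x).detach())
+    f_loss.backward()
+    opt_lower.step()
+    # Upper level: distill general semantic knowledge from the victim
+    opt_upper.zero_grad()
+    kd = F.kl_div(surrogate(x).log_softmax(-1),
+                  victim(x).softmax(-1).detach(), reduction="batchmean")
+    (lam * kd).backward()
+    opt_upper.step()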
+
+ comment: some mistake +
+
+
+
+
+ + ♻ ☆ FusionBooster: A Unified Image Fusion Boosting Paradigm + + +
+ In recent years, numerous ideas have emerged for designing a mutually
+reinforcing mechanism or extra stages for the image fusion task, while
+ignoring the inevitable gaps between different vision tasks and the added
+computational burden. We argue that there is scope to improve fusion
+performance with the help of the FusionBooster, a model specifically designed
+for the fusion task. In particular, our booster is based on a
+divide-and-conquer strategy controlled by an information probe. The booster is
+composed of three building blocks: the probe units, the booster layer, and the
+assembling module. Given the result produced by a backbone method, the probe
+units assess the fused image and divide the results according to their
+information content. This is instrumental in identifying missing information,
+as a step to its recovery. The recovery of the degraded components, along with
+the fusion guidance, is the role of the booster layer. Lastly, the assembling
+module is responsible for piecing these advanced components together to
+deliver the output. We use concise reconstruction loss functions in
+conjunction with lightweight autoencoder models to formulate the learning
+task, with a marginal increase in computational complexity. The experimental
+results obtained in various fusion tasks, as well as downstream detection
+tasks, consistently demonstrate that the proposed FusionBooster significantly
+improves the performance. Our code will be publicly available on the project
+homepage.
+
+</p>
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Collage Diffusion + + +
+ We seek to give users precise control over diffusion-based image generation +by modeling complex scenes as sequences of layers, which define the desired +spatial arrangement and visual attributes of objects in the scene. Collage +Diffusion harmonizes the input layers to make objects fit together -- the key +challenge involves minimizing changes in the positions and key visual +attributes of the input layers while allowing other attributes to change in the +harmonization process. We ensure that objects are generated in the correct +locations by modifying text-image cross-attention with the layers' alpha masks. +We preserve key visual attributes of input layers by learning specialized text +representations per layer and by extending ControlNet to operate on layers. +Layer input allows users to control the extent of image harmonization on a +per-object basis, and users can even iteratively edit individual objects in +generated images while keeping other objects fixed. By leveraging the rich +information present in layer input, Collage Diffusion generates globally +harmonized images that maintain desired object characteristics better than +prior approaches. + +
+
+
+
+
+ + ♻ ☆ StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent + Disentangled Space + + +
+ One major challenge in machine learning applications is coping with +mismatches between the datasets used in the development and those obtained in +real-world applications. These mismatches may lead to inaccurate predictions +and errors, resulting in poor product quality and unreliable systems. In this +study, we propose StyleDiff to inform developers of the differences between the +two datasets for the steady development of machine learning systems. Using +disentangled image spaces obtained from recently proposed generative models, +StyleDiff compares the two datasets by focusing on attributes in the images and +provides an easy-to-understand analysis of the differences between the +datasets. The proposed StyleDiff performs in $O (d N\log N)$, where $N$ is the +size of the datasets and $d$ is the number of attributes, enabling the +application to large datasets. We demonstrate that StyleDiff accurately detects +differences between datasets and presents them in an understandable format +using, for example, driving scenes datasets. + +
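+ The quoted $O(dN\log N)$ cost is what a sort-based, per-attribute
+distribution comparison gives. A minimal stand-in (the exact comparison
+statistic StyleDiff reports is richer) might look like:
+
+import numpy as np
+
+def attribute_shift(attrs_a, attrs_b):
+    """Per-attribute 1-D Wasserstein distance between two datasets.
+
+    attrs_*: (N, d) disentangled attribute matrices; assumes equal sample
+    counts (resample beforehand otherwise). Sorting dominates the cost,
+    giving O(d N log N) overall.
+    """
+    a = np.sort(attrs_a, axis=0)           # O(d N log N)
+    b = np.sort(attrs_b, axis=0)
+    return np.abs(a - b).mean(axis=0)      # shift score per attribute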
+
+ comment: 25 pages, 17 figures, Image and Vision Computing +
+
+
+
+
+ + ♻ ☆ Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning ICCV 2023 + + +
+ Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful
+alternative to full fine-tuning for adapting pre-trained vision models to
+downstream tasks: it tunes only a small number of parameters while freezing
+the vast majority, easing the storage burden and optimization difficulty.
+However, existing PEFT methods introduce trainable parameters at the same
+positions across different tasks, depending solely on human heuristics, and
+neglect the domain gaps. To this end, we study where to introduce and how to
+allocate trainable parameters by proposing a novel Sensitivity-aware visual
+Parameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates
+trainable parameters to task-specific important positions given a desired
+tunable parameter budget. Specifically, our SPT first quickly identifies the
+sensitive parameters that require tuning for a given task in a data-dependent
+way. Next, our SPT further boosts the representational capability of the
+weight matrices whose number of sensitive parameters exceeds a pre-defined
+threshold by utilizing existing structured tuning methods, e.g., LoRA [23] or
+Adapter [22], to replace directly tuning the selected sensitive parameters
+(unstructured tuning) under the budget. Extensive experiments on a wide range
+of downstream recognition tasks show that our SPT is complementary to the
+existing PEFT methods and largely boosts their performance, e.g., SPT improves
+Adapter with a supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean
+Top-1 accuracy, reaching SOTA performance on the FGVC and VTAB-1k benchmarks,
+respectively. Source code is at https://github.com/ziplab/SPT
+
+</p>
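+ The data-dependent identification of sensitive parameters can be sketched
+with a first-order saliency score; the |gradient x weight| criterion and
+per-tensor budget below are common choices but illustrative assumptions about
+SPT's exact rule:
+
+import torch
+
+def sensitive_masks(model, loss, budget):
+    """Mark the top-`budget` task-sensitive weights in each parameter tensor."""
+    model.zero_grad()
+    loss.backward()                        # gradients on a batch of task data
+    masks = {}
+    for name, p in model.named_parameters():
+        if p.grad is None:
+            continue
+        score = (p.grad * p).abs().flatten()
+        k = min(budget, score.numel())
+        thresh = torch.topk(score, k).values[-1]
+        masks[name] = (score >= thresh).view_as(p)   # True = tune this weight
+    return masks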
+
+ comment: ICCV 2023 Oral +
+
+
+
+
+ + ♻ ☆ Improving Underwater Visual Tracking With a Large Scale Dataset and + Image Enhancement + + +
+ This paper presents a new dataset and a general tracker enhancement method
+for Underwater Visual Object Tracking (UVOT). Despite its significance,
+underwater tracking has remained unexplored due to data inaccessibility. It
+poses distinct challenges: the underwater environment exhibits non-uniform
+lighting conditions, low visibility, lack of sharpness, low contrast,
+camouflage, and reflections from suspended particles. The performance of
+traditional tracking methods, designed primarily for terrestrial or open-air
+scenarios, drops in such conditions. We address the problem by proposing a
+novel underwater image enhancement algorithm designed specifically to boost
+tracking quality. The method yields a significant performance improvement of
+up to 5.0% AUC for state-of-the-art (SOTA) visual trackers. To develop robust
+and accurate UVOT methods, large-scale datasets are required. To this end, we
+introduce a large-scale UVOT benchmark dataset consisting of 400 video
+segments and 275,000 manually annotated frames, enabling underwater training
+and evaluation of deep trackers. The videos are labelled with several
+underwater-specific tracking attributes including watercolor variation, target
+distractors, camouflage, target relative size, and low visibility conditions.
+The UVOT400 dataset, tracking results, and the code are publicly available on:
+https://github.com/BasitAlawode/UWVOT400.
+
+</p>
+
+
+
+
+ + ♻ ☆ PV-SSD: A Projection and Voxel-based Double Branch Single-Stage 3D + Object Detector + + +
+ LIDAR-based 3D object detection and classification is crucial for autonomous +driving. However, inference in real-time from extremely sparse 3D data poses a +formidable challenge. To address this issue, a common approach is to project +point clouds onto a bird's-eye or perspective view, effectively converting them +into an image-like data format. However, this excessive compression of point +cloud data often leads to the loss of information. This paper proposes a 3D +object detector based on voxel and projection double branch feature extraction +(PV-SSD) to address the problem of information loss. We add voxel features +input containing rich local semantic information, which is fully fused with the +projected features in the feature extraction stage to reduce the local +information loss caused by projection. A good performance is achieved compared +to the previous work. In addition, this paper makes the following +contributions: 1) a voxel feature extraction method with variable receptive +fields is proposed; 2) a feature point sampling method by weight sampling is +used to filter out the feature points that are more conducive to the detection +task; 3) the MSSFA module is proposed based on the SSFA module. To verify the +effectiveness of our method, we designed comparison experiments. + +
+
+
+
+
+ + ♻ ☆ SAMedOCT: Adapting Segment Anything Model (SAM) for Retinal OCT + + +
+ The Segment Anything Model (SAM) has gained significant attention in the
+field of image segmentation due to its impressive capabilities and
+prompt-based interface. While SAM has already been extensively evaluated in
+various domains, its adaptation to retinal OCT scans remains unexplored. To
+bridge this research gap, we conduct a comprehensive evaluation of SAM and its
+adaptations on a large-scale public dataset of OCTs from the RETOUCH
+challenge. Our evaluation covers diverse retinal diseases, fluid compartments,
+and device vendors, comparing SAM against state-of-the-art retinal fluid
+segmentation methods. Through our analysis, we showcase the adapted SAM's
+efficacy as a powerful segmentation model in retinal OCT scans, although it
+still lags behind established methods in some circumstances. The findings
+highlight SAM's adaptability and robustness, demonstrating its utility as a
+valuable tool in retinal OCT image analysis and paving the way for further
+advancements in this domain.
+
+</p>
+
+
+
+
+ + ♻ ☆ ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under + Challenging Conditions + + +
+ Robust 3D object detection in extreme weather and illumination conditions is +a challenging task. While radars and thermal cameras are known for their +resilience to these conditions, few studies have been conducted on +radar-thermal fusion due to the lack of corresponding datasets. To address this +gap, we first present a new multi-modal dataset called ThermRad, which includes +a 3D LiDAR, a 4D radar, an RGB camera and a thermal camera. This dataset is +unique because it includes data from all four sensors in extreme weather +conditions, providing a valuable resource for future research in this area. To +validate the robustness of 4D radars and thermal cameras for 3D object +detection in challenging weather conditions, we propose a new multi-modal +fusion method called RTDF-RCNN, which leverages the complementary strengths of +4D radars and thermal cameras to boost object detection performance. To further +prove the effectiveness of our proposed framework, we re-implement +state-of-the-art (SOTA) 3D detectors on our dataset as benchmarks for +evaluation. Our method achieves significant enhancements in detecting cars, +pedestrians, and cyclists, with improvements of over 7.98%, 24.27%, and 27.15%, +respectively, while achieving comparable results to LiDAR-based approaches. Our +contributions in both the ThermRad dataset and the new multi-modal fusion +method provide a new approach to robust 3D object detection in adverse weather +and illumination conditions. The ThermRad dataset will be released. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Learning Deep Intensity Field for Extremely Sparse-View CBCT + Reconstruction MICCAI'23 + + +
+ Sparse-view cone-beam CT (CBCT) reconstruction is an important direction to +reduce radiation dose and benefit clinical applications. Previous voxel-based +generation methods represent the CT as discrete voxels, resulting in high +memory requirements and limited spatial resolution due to the use of 3D +decoders. In this paper, we formulate the CT volume as a continuous intensity +field and develop a novel DIF-Net to perform high-quality CBCT reconstruction +from extremely sparse (fewer than 10) projection views at an ultrafast speed. +The intensity field of a CT can be regarded as a continuous function of 3D +spatial points. Therefore, the reconstruction can be reformulated as regressing +the intensity value of an arbitrary 3D point from given sparse projections. +Specifically, for a point, DIF-Net extracts its view-specific features from +different 2D projection views. These features are subsequently aggregated by a +fusion module for intensity estimation. Notably, thousands of points can be +processed in parallel to improve efficiency during training and testing. In +practice, we collect a knee CBCT dataset to train and evaluate DIF-Net. +Extensive experiments show that our approach can reconstruct CBCT with high +image quality and high spatial resolution from extremely sparse views within +1.6 seconds, significantly outperforming state-of-the-art methods. Our code +will be available at https://github.com/xmed-lab/DIF-Net. + +
+
+ comment: MICCAI'23 +
+
+
+
+
+ + ♻ ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g.,
+bones, organs, vessels) and 3D surgical instrument models. The broad
+application of statistical shape models (SSMs) in medical image analysis
+prior to the deep learning era is evidence that shapes were commonly used to
+describe medical data. Nowadays, however, state-of-the-art (SOTA) deep
+learning algorithms in medical imaging are predominantly voxel-based. In
+computer vision, on the contrary, shapes (including voxel occupancy grids,
+meshes, point clouds, and implicit surface models) are preferred data
+representations in 3D, as seen from the numerous shape-related publications in
+premier vision conferences, such as the IEEE/CVF Conference on Computer Vision
+and Pattern Recognition (CVPR), as well as the increasing popularity of
+ShapeNet (about 51,300 models) and Princeton ModelNet (127,915 models) in
+computer vision research. MedShapeNet is created as an alternative to these
+commonly used shape benchmarks to facilitate the translation of data-driven
+vision algorithms to medical applications, and it extends the opportunities to
+adapt SOTA vision algorithms to solve critical medical problems. Besides, the
+majority of the medical shapes in MedShapeNet are modeled directly on the
+imaging data of real patients, and therefore it complements well existing
+shape benchmarks comprising computer-aided design (CAD) models. MedShapeNet
+currently includes more than 100,000 medical shapes, and provides annotations
+in the form of paired data. It is therefore also a freely available repository
+of 3D models for extended reality (virtual reality - VR, augmented reality -
+AR, mixed reality - MR) and medical 3D printing. This white paper describes in
+detail the motivations behind MedShapeNet, the shape acquisition procedures,
+the use cases, as well as the usage of the online shape search portal:
+https://medshapenet.ikim.nrw/
+
+</p>
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ MacFormer: Map-Agent Coupled Transformer for Real-time and Robust + Trajectory Prediction + + +
+ Predicting the future behavior of agents is a fundamental task in autonomous
+vehicle domains. Accurate prediction relies on comprehending the surrounding
+map, which significantly regularizes agent behaviors. However, existing
+methods have limitations in exploiting the map and exhibit a strong dependence
+on historical trajectories, which yields unsatisfactory prediction performance
+and robustness. Additionally, their heavy network architectures impede
+real-time applications. To tackle these problems, we propose the Map-Agent
+Coupled Transformer (MacFormer) for real-time and robust trajectory
+prediction. Our framework explicitly incorporates map constraints into the
+network via two carefully designed modules named the coupled map and the
+reference extractor. A novel multi-task optimization strategy (MTOS) is
+presented to enhance learning of topology and rule constraints. We also devise
+a bilateral query scheme in context fusion for a more efficient and
+lightweight network. We evaluated our approach on the Argoverse 1, Argoverse
+2, and nuScenes real-world benchmarks, where it achieved state-of-the-art
+performance across all three with the lowest inference latency and smallest
+model size. Experiments also demonstrate that our framework is resilient to
+imperfect tracklet inputs. Furthermore, we show that by combining with our
+proposed strategies, classical models outperform their baselines, further
+validating the versatility of our framework.
+
+</p>
+
+ comment: Accepted by IEEE Robotics and Automation Letters. 8 Pages, 9 Figures, + 9 Tables. Video: https://www.youtube.com/watch?v=XY388iI6sPQ +
+
+
+
+
+ + ♻ ☆ Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation + + +
+ Medical image data are often limited due to the expensive acquisition and
+annotation process. Hence, training a deep-learning model with only raw data
+can easily lead to overfitting. One solution to this problem is to augment the
+raw data with various transformations, improving the model's ability to
+generalize to new data. However, manually configuring a generic augmentation
+combination and parameters for different datasets is non-trivial due to
+inconsistent acquisition approaches and data distributions. Therefore,
+automatic data augmentation has been proposed to learn favorable augmentation
+strategies for different datasets, but existing approaches incur large GPU
+overhead. To this end, we present a novel method, called Dynamic Data
+Augmentation (DDAug), which is efficient and has negligible computation cost.
+Our DDAug develops a hierarchical tree structure to represent various
+augmentations and utilizes an efficient Monte-Carlo tree search algorithm to
+update, prune, and sample the tree. As a result, the augmentation pipeline can
+be optimized for each dataset automatically. Experiments on multiple Prostate
+MRI datasets show that our method outperforms the current state-of-the-art
+data augmentation strategies.
+
+</p>
+
+
+
+
+ + ♻ ☆ LRANet: Towards Accurate and Efficient Scene Text Detection with + Low-Rank Approximation Network + + +
+ Recently, regression-based methods, which predict parameterized text shapes
+for text localization, have gained popularity in scene text detection. However,
+the existing parameterized text shape methods still have limitations in
+modeling arbitrary-shaped texts because they ignore text-specific shape
+information. Moreover, the time consumption of the entire pipeline has been
+largely overlooked, leading to a suboptimal overall inference speed. To address
+these issues, we first propose a novel parameterized text shape method based on
+low-rank approximation. Unlike other shape representation methods that employ
+data-independent parameterizations, our approach utilizes singular value
+decomposition and reconstructs the text shape using a few eigenvectors learned
+from labeled text contours. By exploring the shape correlation among different
+text contours, our method achieves consistency, compactness, simplicity, and
+robustness in shape representation. Next, we propose a dual assignment scheme
+for speed acceleration. It adopts a sparse assignment branch to accelerate the
+inference speed, and meanwhile, provides ample supervision signals for training
+through a dense assignment branch. Building upon these designs, we implement an
+accurate and efficient arbitrary-shaped text detector named LRANet. Extensive
+experiments are conducted on several challenging benchmarks, demonstrating the
+superior accuracy and efficiency of LRANet compared to state-of-the-art
+methods. Code will be released soon.
+
+
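+ A sketch of the low-rank contour representation idea: learn a few shape
+eigenvectors from labeled contours via SVD, so a detector only has to regress
+the coefficients (data and sizes below are illustrative, not the paper's):
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+contours = rng.standard_normal((1000, 2 * 32))   # 1000 contours, 32 (x, y) points
+mean = contours.mean(axis=0)
+U, s, Vt = np.linalg.svd(contours - mean, full_matrices=False)
+basis = Vt[:8]                                   # top-8 shape eigenvectors
+
+coeffs = (contours[0] - mean) @ basis.T          # what the detector would regress
+recon = mean + coeffs @ basis                    # contour rebuilt from 8 numbers
+print(np.abs(recon - contours[0]).max())         # residual of the rank-8 model
+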
+
+
+
+
+ + ♻ ☆ Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning + + +
+ Federated learning (FL) enables multiple clients to collaboratively train a
+global model without disclosing their data. Previous research often requires
+training the complete set of model parameters. However, the emergence of
+powerful pre-trained models makes it possible to achieve higher performance
+with fewer learnable parameters in FL. In this paper, we propose a federated
+adaptive prompt tuning algorithm, FedAPT, for multi-domain collaborative image
+classification with powerful foundation models, such as CLIP. Compared with
+direct federated prompt tuning, our core idea is to adaptively unlock specific
+domain knowledge for each test sample in order to provide them with
+personalized prompts. To implement this idea, we design an adaptive prompt
+tuning module, which consists of a meta prompt, an adaptive network, and some
+keys. The server randomly generates a set of keys and assigns a unique key to
+each client. Then all clients cooperatively train the global adaptive network
+and meta prompt with the local datasets and the frozen keys. Ultimately, the
+global aggregation model can assign a personalized prompt to CLIP based on the
+domain features of each test sample. We perform extensive experiments on two
+multi-domain image classification datasets across two different settings -
+supervised and unsupervised. The results show that FedAPT can achieve better
+performance with less than 10\% of the number of parameters of the fully
+trained model, and the global model can perform well in diverse client domains
+simultaneously.
+
+
+
+
+
+
+ + ♻ ☆ Collaborative Chinese Text Recognition with Personalized Federated + Learning + + +
+ In Chinese text recognition, to compensate for the insufficient local data
+and improve the performance of local few-shot character recognition, it is
+often necessary for one organization to collect a large amount of data from
+similar organizations. However, due to the natural presence of private
+information in text data, such as addresses and phone numbers, different
+organizations are unwilling to share private data. Therefore, it becomes
+increasingly important to design a privacy-preserving collaborative training
+framework for the Chinese text recognition task. In this paper, we introduce
+personalized federated learning (pFL) into the Chinese text recognition task
+and propose the pFedCR algorithm, which significantly improves the model
+performance of each client (organization) without sharing private data.
+Specifically, pFedCR comprises two stages: a multi-round global model training
+stage and a local personalization stage. During stage 1, an attention mechanism
+is incorporated into the CRNN model to adapt to various client data
+distributions. Leveraging inherent character data characteristics, a balanced
+dataset is created on the server to mitigate character imbalance. In the
+personalization phase, the global model is fine-tuned for one epoch to create a
+local model. Parameter averaging between local and global models combines
+personalized and global feature extraction capabilities. Finally, we fine-tune
+only the attention layers to enhance their focus on local personalized
+features. The experimental results on three real-world industrial scenario
+datasets show that the pFedCR algorithm can improve the performance of local
+personalized models by about 20\% while also improving their generalization
+performance on other client data domains. Compared to other state-of-the-art
+personalized federated learning methods, pFedCR improves performance by 6\%
+$\sim$ 8\%.
+
+
+
+
+
+
+ + ♻ ☆ RECLIP: Resource-efficient CLIP by Training with Small Images + + +
+ We present RECLIP (Resource-efficient CLIP), a simple method that minimizes
+computational resource footprint for CLIP (Contrastive Language Image
+Pretraining). Inspired by the notion of coarse-to-fine in computer vision, we
+leverage small images to learn from large-scale language supervision
+efficiently, and finetune the model with high-resolution data in the end. Since
+the complexity of the vision transformer heavily depends on input image size,
+our approach significantly reduces the training resource requirements both in
+theory and in practice. Using the same batch size and training epoch, RECLIP
+achieves highly competitive zero-shot classification and image-text retrieval
+accuracy with 6 to 8x fewer computational resources and 7 to 9x fewer FLOPs
+than the baseline. Compared to the state-of-the-art contrastive learning
+methods, RECLIP demonstrates 5 to 59x training resource savings while
+maintaining highly competitive zero-shot classification and retrieval
+performance. Finally, RECLIP matches the state of the art in transfer learning
+to open-vocabulary detection tasks, achieving 32 APr on LVIS. We hope this work
+will pave the way for the broader research community to explore language
+supervised pretraining in resource-friendly settings.
+
+
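+ A quick back-of-the-envelope on why small images save compute for a
+ViT-based image tower (a patch size of 16 is assumed here for illustration):
+
+# The token count of a ViT scales with (image_size / patch_size) ** 2, and
+# self-attention cost with the square of the token count, which is why doing
+# most of the pretraining at a small resolution saves the bulk of the compute.
+def vit_tokens(image_size, patch_size=16):
+    return (image_size // patch_size) ** 2
+
+for size in (64, 112, 224):
+    n = vit_tokens(size)
+    print(size, n, n ** 2)   # 64 -> 16 tokens vs 224 -> 196: ~150x attention cost
+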
+
+ comment: Published at Transactions on Machine Learning Research +
+
+
+
+
+ + ♻ ☆ Group DETR: Fast DETR Training with Group-Wise One-to-Many Assignment ICCV23 + + +
+ Detection transformer (DETR) relies on one-to-one assignment, assigning one
+ground-truth object to one prediction, for end-to-end detection without NMS
+post-processing. It is known that one-to-many assignment, assigning one
+ground-truth object to multiple predictions, succeeds in detection methods such
+as Faster R-CNN and FCOS. However, naive one-to-many assignment does not work
+for DETR, and it remains challenging to apply one-to-many assignment to DETR
+training. In this paper, we introduce Group DETR, a simple yet efficient DETR
+training approach that introduces a group-wise way for one-to-many assignment.
+This approach involves using multiple groups of object queries, conducting
+one-to-one assignment within each group, and performing decoder self-attention
+separately. It resembles data augmentation with automatically-learned object
+query augmentation. It is also equivalent to simultaneously training
+parameter-sharing networks of the same architecture, introducing more
+supervision and thus improving DETR training. The inference process is the same
+as for a DETR trained normally and only needs one group of queries without any
+architecture modification. Group DETR is versatile and is applicable to various
+DETR variants. The experiments show that Group DETR significantly speeds up the
+training convergence and improves the performance of various DETR-based models.
+Code will be available at \url{https://github.com/Atten4Vis/GroupDETR}.
+
+
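+ A sketch of group-wise one-to-many assignment: Hungarian one-to-one matching
+run independently inside each query group, so every ground-truth object ends
+up with one positive query per group (cost construction is omitted and all
+names are illustrative):
+
+import torch
+from scipy.optimize import linear_sum_assignment
+
+def groupwise_one_to_one(cost, num_groups):
+    # cost: (total queries, ground-truth objects) matching cost matrix.
+    Q, G = cost.shape
+    per_group = Q // num_groups
+    matches = []
+    for g in range(num_groups):
+        block = cost[g * per_group:(g + 1) * per_group]
+        rows, cols = linear_sum_assignment(block.numpy())
+        matches += [(g * per_group + r, c) for r, c in zip(rows, cols)]
+    return matches            # each ground-truth index appears num_groups times
+
+cost = torch.rand(300, 5)     # e.g. 3 groups of 100 queries, 5 objects
+print(len(groupwise_one_to_one(cost, num_groups=3)))  # 15 = 3 groups x 5 objects
+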
+
+ comment: ICCV23 camera ready version +
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding and Improving Adversarial + Transferability from Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs
+that successfully fool white-box surrogate models can also deceive other
+black-box models with different architectures. Although numerous empirical
+studies have provided guidance on generating highly transferable AEs, many of
+these findings lack explanations and even lead to inconsistent advice. In this
+paper, we take a further step towards understanding adversarial
+transferability, with a particular focus on surrogate aspects. Starting from
+the intriguing little robustness phenomenon, where models adversarially trained
+with mildly perturbed adversarial samples can serve as better surrogates, we
+attribute it to a trade-off between two predominant factors: model smoothness
+and gradient similarity. Our investigations focus on their joint effects,
+rather than their separate correlations with transferability. Through a series
+of theoretical and empirical analyses, we conjecture that the data distribution
+shift in adversarial training explains the degradation of gradient similarity.
+Building on these insights, we explore the impacts of data augmentation and
+gradient regularization on transferability and identify that the trade-off
+generally exists in the various training mechanisms, thus building a
+comprehensive blueprint for the regulation mechanism behind transferability.
+Finally, we provide a general route for constructing better surrogates to boost
+transferability which optimizes both model smoothness and gradient similarity
+simultaneously, e.g., the combination of input gradient regularization and
+sharpness-aware minimization (SAM), validated by extensive experiments. In
+summary, we call for attention to the joint impacts of these two factors for
+launching effective transfer attacks, rather than optimizing one while ignoring
+the other, and emphasize the crucial role of manipulating surrogate models.
+
+
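+ A sketch of one ingredient of that route, input-gradient regularization for
+surrogate training (a generic penalty on loss-input gradients; SAM would be
+combined on top, and the model below is only a placeholder):
+
+import torch
+import torch.nn.functional as F
+
+def ig_regularized_loss(model, x, y, lam=1.0):
+    # Cross-entropy plus a squared input-gradient penalty, which promotes
+    # model smoothness with respect to the inputs.
+    x = x.clone().requires_grad_(True)
+    loss = F.cross_entropy(model(x), y)
+    (g,) = torch.autograd.grad(loss, x, create_graph=True)
+    return loss + lam * g.pow(2).sum()
+
+model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+x, y = torch.randn(4, 3, 32, 32), torch.randint(0, 10, (4,))
+ig_regularized_loss(model, x, y).backward()   # train the surrogate with this loss
+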
+
+ comment: IEEE Symposium on Security and Privacy (Oakland) 2024; Extended + version of camera-ready +
+
+
+
+
+ + ♻ ☆ Quaternion-valued Correlation Learning for Few-Shot Semantic + Segmentation + + +
+ Few-shot segmentation (FSS) aims to segment unseen classes given only a few
+annotated samples. Encouraging progress has been made for FSS by leveraging
+semantic features learned from base classes with sufficient training samples to
+represent novel classes. However, correlation-based methods lack the ability to
+model interactions between the two subspace matching scores due to the inherent
+nature of real-valued 2D convolutions. In this paper, we introduce a
+quaternion perspective on correlation learning and propose a novel
+Quaternion-valued Correlation Learning Network (QCLNet), with the aim of
+alleviating the computational burden of high-dimensional correlation tensors
+and exploring internal latent interaction between query and support images by
+leveraging operations defined by the established quaternion algebra.
+Specifically, our QCLNet is formulated as a hyper-complex valued network and
+represents correlation tensors in the quaternion domain, which uses
+quaternion-valued convolution to explore the external relations of query
+subspace when considering the hidden relationship of the support sub-dimension
+in the quaternion space. Extensive experiments on the PASCAL-5i and COCO-20i
+datasets demonstrate that our method outperforms the existing state-of-the-art
+methods effectively. Our code is available at
+https://github.com/zwzheng98/QCLNet and our article "Quaternion-valued
+Correlation Learning for Few-Shot Semantic Segmentation" was published in IEEE
+Transactions on Circuits and Systems for Video Technology, vol.
+33,no.5,pp.2102-2115,May 2023,doi: 10.1109/TCSVT.2022.3223150.
+
+
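+ The Hamilton product at the heart of quaternion-valued operations, sketched
+for feature maps split into four real components (a generic building block,
+not QCLNet's exact layers):
+
+import torch
+
+def hamilton_product(q1, q2):
+    # Quaternion multiplication of two (r, x, y, z) tuples of tensors; it
+    # mixes all four components, unlike independent real-valued channels.
+    r1, x1, y1, z1 = q1
+    r2, x2, y2, z2 = q2
+    return (r1 * r2 - x1 * x2 - y1 * y2 - z1 * z2,
+            r1 * x2 + x1 * r2 + y1 * z2 - z1 * y2,
+            r1 * y2 - x1 * z2 + y1 * r2 + z1 * x2,
+            r1 * z2 + x1 * y2 - y1 * x2 + z1 * r2)
+
+q = tuple(torch.randn(1, 16, 8, 8) for _ in range(4))  # a quaternion feature map
+out = hamilton_product(q, q)
+print(out[0].shape)   # torch.Size([1, 16, 8, 8])
+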
+
+ comment: for associated paper file, see + https://ieeexplore.ieee.org/document/9954424?source=authoralert +
+
+
+
+
+ + ♻ ☆ SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space + Reconstruction + + +
+ Segment Anything Model (SAM) has received remarkable attention as it offers a
+powerful and versatile solution for object segmentation in images. However,
+fine-tuning SAM for downstream segmentation tasks under different scenarios
+remains a challenge, as the varied characteristics of different scenarios
+naturally require diverse model parameter spaces. Most existing fine-tuning
+methods attempt to bridge the gaps among different scenarios by introducing a
+set of new parameters to modify SAM's original parameter space. Unlike these
+works, in this paper, we propose fine-tuning SAM efficiently by parameter space
+reconstruction (SAM-PARSER), which introduces nearly zero trainable parameters
+during fine-tuning. In SAM-PARSER, we assume that SAM's original parameter
+space is relatively complete, so that its bases are able to reconstruct the
+parameter space of a new scenario. We obtain the bases by matrix decomposition
+and fine-tune the coefficients to reconstruct a parameter space tailored to the
+new scenario through an optimal linear combination of the bases. Experimental
+results show that SAM-PARSER exhibits superior segmentation performance across
+various scenarios, while reducing the number of trainable parameters by
+$\approx 290$ times compared with current parameter-efficient fine-tuning
+methods.
+
+
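+ A minimal PyTorch sketch of the reconstruction idea under stated
+assumptions: freeze the SVD bases of a pretrained weight matrix and train only
+per-basis coefficients (names and sizes are illustrative, not the released
+code):
+
+import torch
+
+W0 = torch.randn(256, 256)                      # a frozen pretrained weight
+U, S, Vh = torch.linalg.svd(W0, full_matrices=False)
+
+coeff = torch.nn.Parameter(torch.ones_like(S))  # the only trainable parameters
+
+def reconstructed_weight():
+    # Linear recombination of the fixed SVD bases; coeff == 1 recovers W0.
+    return U @ torch.diag(coeff * S) @ Vh
+
+opt = torch.optim.Adam([coeff], lr=1e-3)
+x, target = torch.randn(8, 256), torch.randn(8, 256)
+for _ in range(10):
+    loss = torch.nn.functional.mse_loss(x @ reconstructed_weight().T, target)
+    opt.zero_grad(); loss.backward(); opt.step()
+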
+
+
+
+
+ + ♻ ☆ Visual correspondence-based explanations improve AI robustness and + human-AI team accuracy NeurIPS 2022 + + +
+ Explaining artificial intelligence (AI) predictions is increasingly important
+and even imperative in many high-stakes applications where humans are the
+ultimate decision-makers. In this work, we propose two novel architectures of
+self-interpretable image classifiers that first explain, and then predict (as
+opposed to post-hoc explanations) by harnessing the visual correspondences
+between a query image and exemplars. Our models consistently improve (by 1 to 4
+points) on out-of-distribution (OOD) datasets while performing marginally worse
+(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest
+neighbor classifier (kNN). Via a large-scale human study on ImageNet and CUB,
+our correspondence-based explanations are found to be more useful to users than
+kNN explanations. Our explanations help users more accurately reject AI's wrong
+decisions than all other tested methods. Interestingly, for the first time, we
+show that it is possible to achieve complementary human-AI team accuracy (i.e.,
+accuracy higher than that of either the AI alone or the human alone), in
+ImageNet and CUB image classification tasks.
+
+
+
+ comment: NeurIPS 2022 conference paper +
+
+
+
+
+ + ♻ ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in
+soybean sprouts image analysis. The CongNaMul dataset is curated to facilitate
+tasks such as image classification, semantic segmentation, decomposition, and
+measurement of length and weight. The classification task provides four classes
+to determine the quality of soybean sprouts: normal, broken, spotted, and
+broken and spotted, for the development of AI-aided automatic quality
+inspection technology. For semantic segmentation, images with varying
+complexity, from single sprout images to images with multiple sprouts, along
+with human-labelled mask images, are included. The labels comprise four
+classes: background, head, body, and tail. The dataset also provides images and
+masks for the image decomposition task, including two separate sprout images
+and their combined form. Lastly, 5 physical features of sprouts (head length,
+body length, body thickness, tail length, weight) are provided for image-based
+measurement tasks. This dataset is expected to be a valuable resource for a
+wide range of research and applications in the advanced analysis of images of
+soybean sprouts. Also, we hope that this dataset can assist researchers
+studying classification, semantic segmentation, decomposition, and physical
+feature measurement in other industrial fields, in evaluating their models. The
+dataset is available at the authors' repository. (https://bhban.kr/data)
+
+
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ♻ ☆ CircleFormer: Circular Nuclei Detection in Whole Slide Images with + Circle Queries and Attention MICCAI 2023 + + +
+ Both CNN-based and Transformer-based object detection with bounding box
+representation have been extensively studied in computer vision and medical
+image analysis, but circular object detection in medical images is still
+underexplored. Inspired by the recent anchor-free CNN-based circular object
+detection method (CircleNet) for ball-shape glomeruli detection in renal
+pathology, in this paper, we present CircleFormer, a Transformer-based circular
+medical object detection method with dynamic anchor circles. Specifically,
+queries with circle representation in the Transformer decoder iteratively
+refine the circular object detection results, and a circle cross attention
+module is introduced to compute the similarity between circular queries and
+image features. A generalized circle IoU (gCIoU) is proposed to serve as a new
+regression loss for circular object detection as well. Moreover, our approach
+is easy to generalize to the segmentation task by adding a simple segmentation
+branch to CircleFormer. We evaluate our method in circular nuclei detection and
+segmentation on the public MoNuSeg dataset, and the experimental results show
+that our method achieves promising performance compared with the
+state-of-the-art approaches. The effectiveness of each component is validated
+via ablation studies as well. Our code is released at
+https://github.com/zhanghx-iim-ahu/CircleFormer.
+
+
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ A Region-based Randers Geodesic Approach for Image Segmentation + + +
+ The geodesic model based on the eikonal partial differential equation (PDE)
+has served as a fundamental tool for the applications of image segmentation and
+boundary detection in the past two decades. However, the existing approaches
+commonly only exploit the image edge-based features for computing minimal
+geodesic paths, potentially limiting their performance in complicated
+segmentation situations. In this paper, we introduce a new variational image
+segmentation model based on the minimal geodesic path framework and the eikonal
+PDE, where the region-based appearance term that defines the regional
+homogeneity features can be taken into account for estimating the associated
+minimal geodesic paths. This is done by constructing a Randers geodesic metric
+interpretation of the region-based active contour energy functional. As a
+result, the minimization of the active contour energy functional is transformed
+into finding the solution to the Randers eikonal PDE.
+ We also suggest a practical interactive image segmentation strategy, where
+the target boundary can be delineated by the concatenation of several piecewise
+geodesic paths. We invoke the Finsler variant of the fast marching method to
+estimate the geodesic distance map, yielding an efficient implementation of the
+proposed region-based Randers geodesic model for image segmentation.
+Experimental results on both synthetic and real images show that our model
+indeed achieves encouraging segmentation performance.
+
+
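+ An isotropic toy version of the eikonal building block using scikit-fmm
+(the paper requires the Finsler fast marching variant for the asymmetric
+Randers metric; this sketch only shows the distance-map mechanics, with all
+values illustrative):
+
+import numpy as np
+import skfmm   # pip install scikit-fmm
+
+phi = np.ones((128, 128))
+phi[64, 64] = -1                 # seed point from which distances are grown
+speed = np.ones((128, 128))      # a region-driven speed map would replace this
+travel_time = skfmm.travel_time(phi, speed)
+print(travel_time.shape)         # geodesic distance map from the seed
+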
+
+ comment: To Appear in International Journal of Computer Vision +
+
+
+
+
+ + ♻ ☆ MMVP: Motion-Matrix-based Video Prediction ICCV 2023 + + +
+ A central challenge of video prediction is that the system has to reason
+about objects' future motions from image frames while simultaneously
+maintaining the consistency of their appearances across frames. This work
+introduces an end-to-end trainable two-stream video prediction framework,
+Motion-Matrix-based Video Prediction (MMVP), to tackle this challenge. Unlike
+previous methods that usually handle motion prediction and appearance
+maintenance within the same set of modules, MMVP decouples motion and
+appearance information by constructing appearance-agnostic motion matrices. The
+motion matrices represent the temporal similarity of each and every pair of
+feature patches in the input frames, and are the sole input of the motion
+prediction module in MMVP. This design improves video prediction in both
+accuracy and efficiency, and reduces the model size. Results of extensive
+experiments demonstrate that MMVP outperforms state-of-the-art systems on
+public data sets by non-negligible large margins (about 1 dB in PSNR on UCF
+Sports) with significantly smaller model sizes (84% of the size or smaller).
+
+
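+ A minimal sketch of an appearance-agnostic motion matrix: pairwise cosine
+similarity between feature patches of two consecutive frames (shapes are
+illustrative assumptions):
+
+import torch
+import torch.nn.functional as F
+
+def motion_matrix(feat_t, feat_t1):
+    # feat_*: (batch, channels, H, W) patch features of frames t and t+1.
+    a = F.normalize(feat_t.flatten(2), dim=1)    # (B, C, H*W)
+    b = F.normalize(feat_t1.flatten(2), dim=1)
+    return torch.einsum('bcm,bcn->bmn', a, b)    # (B, H*W, H*W) similarities
+
+M = motion_matrix(torch.randn(2, 64, 16, 16), torch.randn(2, 64, 16, 16))
+print(M.shape)   # torch.Size([2, 256, 256])
+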
+
+ comment: ICCV 2023 (Oral) +
+
+
+
+
+ + ♻ ☆ Neural Video Compression with Temporal Layer-Adaptive Hierarchical + B-frame Coding + + +
+ Neural video compression (NVC) is a rapidly evolving video coding research
+area, with some models achieving superior coding efficiency compared to the
+latest video coding standard, Versatile Video Coding (VVC). In conventional
+video coding standards, hierarchical B-frame coding, which utilizes a
+bidirectional prediction structure for higher compression, has been well
+studied and exploited. In NVC, however, limited research has investigated the
+hierarchical B scheme. In this paper, we propose an NVC model exploiting
+hierarchical B-frame coding with temporal layer-adaptive optimization. We first
+extend an existing unidirectional NVC model to a bidirectional model, which
+achieves -21.13% BD-rate gain over the unidirectional baseline model. However,
+this model faces challenges when applied to sequences with complex or large
+motions, leading to performance degradation. To address this, we introduce
+temporal layer-adaptive optimization, incorporating methods such as temporal
+layer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent
+scaling (TALS). The final model with the proposed methods achieves an
+impressive BD-rate gain of -39.86% against the baseline. It also resolves the
+challenges in sequences with large or complex motions with up to -49.13% more
+BD-rate gains than the simple bidirectional extension. This improvement is
+attributed to the allocation of more bits to lower temporal layers, thereby
+enhancing overall reconstruction quality with smaller bits. Since our method
+has little dependency on a specific NVC model architecture, it can serve as a
+general tool for extending unidirectional NVC models to ones with
+hierarchical B-frame coding.
+
+
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Co-evolving Vector Quantization for ID-based Recommendation + + +
+ Category information plays a crucial role in enhancing the quality and
+personalization of recommendations. Nevertheless, item category information is
+not always available, particularly in the context of ID-based recommendation.
+In this work, we propose an alternative approach to automatically learn and
+generate entity (i.e., user and item) categorical information at different
+levels of granularity, specifically for ID-based recommendation. To this end,
+we devise a co-evolving vector quantization framework, namely COVE, which
+enables the simultaneous learning and refinement of code representation and
+entity embedding in an end-to-end manner, starting from the randomly
+initialized states. With its high adaptability, COVE can be easily integrated
+into existing recommendation models. We validate the effectiveness of COVE on
+various recommendation tasks including list completion, collaborative
+filtering, and click-through rate prediction, across different recommendation
+models. We will publish the code and data for other researchers to reproduce
+our work.
+
+
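+ The codebook building block underlying such vector quantization, sketched as
+a standard straight-through VQ module (a generic VQ-VAE-style sketch, not
+COVE's exact framework):
+
+import torch
+import torch.nn.functional as F
+
+class VectorQuantizer(torch.nn.Module):
+    # Codebook lookup with a straight-through gradient, so discrete
+    # "category" codes and entity embeddings can co-evolve end to end.
+    def __init__(self, num_codes=64, dim=32, beta=0.25):
+        super().__init__()
+        self.codebook = torch.nn.Embedding(num_codes, dim)
+        self.beta = beta
+
+    def forward(self, z):                            # z: (batch, dim) embeddings
+        d = torch.cdist(z, self.codebook.weight)     # distances to all codes
+        idx = d.argmin(dim=1)                        # assigned category per entity
+        z_q = self.codebook(idx)
+        # Codebook + commitment losses pull codes and embeddings together.
+        loss = F.mse_loss(z_q, z.detach()) + self.beta * F.mse_loss(z, z_q.detach())
+        z_q = z + (z_q - z).detach()                 # straight-through estimator
+        return z_q, idx, loss
+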
+
+
+
+
+ + ☆ Context Aware Query Rewriting for Text Rankers using LLM + + +
+ Query rewriting refers to an established family of approaches that are
+applied to underspecified and ambiguous queries to overcome the vocabulary
+mismatch problem in document ranking. Queries are typically rewritten during
+query processing time for better query modelling for the downstream ranker.
+With the advent of large-language models (LLMs), there have been initial
+investigations into using generative approaches to generate pseudo documents to
+tackle this inherent vocabulary gap. In this work, we analyze the utility of
+LLMs for improved query rewriting for text ranking tasks. We find that there
+are two inherent limitations of using LLMs as query re-writers -- concept drift
+when using only queries as prompts and large inference costs during query
+processing. We adopt a simple, yet surprisingly effective, approach called
+context aware query rewriting (CAR) to leverage the benefits of LLMs for query
+understanding. Firstly, we rewrite ambiguous training queries by context-aware
+prompting of LLMs, where we use only relevant documents as context. Unlike
+existing approaches, we use LLM-based query rewriting only during the training
+phase. A ranker is then fine-tuned on the rewritten queries instead of the
+original queries during training. In our extensive experiments, we find
+that fine-tuning a ranker using re-written queries offers a significant
+improvement of up to 33% on the passage ranking task and up to 28% on the
+document ranking task when compared to the baseline performance of using
+original queries.
+
+
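+ A sketch of what a context-aware rewriting prompt might look like (the
+wording below is an illustrative assumption, not the paper's prompt):
+
+def car_rewrite_prompt(query, relevant_docs, k=2):
+    # Ground the rewrite in relevant documents so it stays on-topic; since
+    # rewriting happens only at training time, inference cost is unaffected.
+    context = "\n\n".join(relevant_docs[:k])
+    return (
+        "Rewrite the search query so it is specific and unambiguous, "
+        "using only vocabulary consistent with the context.\n"
+        f"Context:\n{context}\n"
+        f"Query: {query}\n"
+        "Rewritten query:"
+    )
+
+print(car_rewrite_prompt("jaguar speed", ["The jaguar (Panthera onca) can sprint..."]))
+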
+
+
+
+
+ + ☆ Concentrating on the Impact: Consequence-based Explanations in + Recommender Systems + + +
+ Recommender systems assist users in decision-making, where the presentation +of recommended items and their explanations are critical factors for enhancing +the overall user experience. Although various methods for generating +explanations have been proposed, there is still room for improvement, +particularly for users who lack expertise in a specific item domain. In this +study, we introduce the novel concept of \textit{consequence-based +explanations}, a type of explanation that emphasizes the individual impact of +consuming a recommended item on the user, which makes the effect of following +recommendations clearer. We conducted an online user study to examine our +assumption about the appreciation of consequence-based explanations and their +impacts on different explanation aims in recommender systems. Our findings +highlight the importance of consequence-based explanations, which were +well-received by users and effectively improved user satisfaction in +recommender systems. These results provide valuable insights for designing +engaging explanations that can enhance the overall user experience in +decision-making. + +
+
+ comment: Preprint of the paper to be presented at IntRS'23: Joint Workshop on
 Interfaces and Human Decision Making for Recommender Systems, September 18,
 2023, Singapore. The paper will be published in the workshop proceedings
+
+
+
+
+
+ + ☆ Towards Long-Tailed Recognition for Graph Classification via + Collaborative Experts + + +
+ Graph classification, which aims to learn graph-level representations for
+effective class assignment, has achieved outstanding results; however, these
+results rely heavily on high-quality datasets with balanced class
+distributions. In fact, most real-world graph data naturally presents a
+long-tailed form, where head classes contain many more samples than tail
+classes; it is thus essential to study graph-level classification over
+long-tailed data, yet this setting remains largely unexplored. Moreover, most
+existing long-tailed learning methods in vision fail to jointly optimize
+representation learning and classifier training, and neglect the mining of
+hard-to-classify classes. Directly applying existing methods to graphs may lead
+to sub-optimal performance, since the model trained on graphs would be more
+sensitive to the long-tailed distribution due to the complex topological
+characteristics. Hence, in this paper, we propose a novel long-tailed
+graph-level classification framework via Collaborative Multi-expert Learning
+(CoMe) to tackle the problem. To equilibrate the contributions of head and tail
+classes, we first develop balanced contrastive learning from the view of
+representation learning, and then design individual-expert classifier
+training based on hard class mining. In addition, we execute gated fusion and
+disentangled knowledge distillation among the multiple experts to promote
+collaboration in the multi-expert framework. Comprehensive experiments are
+performed on seven widely-used benchmark datasets to demonstrate the
+superiority of our method CoMe over state-of-the-art baselines.
+
+
+
+ comment: Accepted by IEEE Transactions on Big Data (TBD 2024) +
+
+
+
+
+ + ☆ Recommender AI Agent: Integrating Large Language Models for Interactive + Recommendations + + +
+ Recommender models excel at providing domain-specific item recommendations by
+leveraging extensive user behavior data. Despite their ability to act as
+lightweight domain experts, they struggle to perform versatile tasks such as
+providing explanations and engaging in conversations. On the other hand, large
+language models (LLMs) represent a significant step towards artificial general
+intelligence, showcasing remarkable capabilities in instruction comprehension,
+commonsense reasoning, and human interaction. However, LLMs lack the knowledge
+of domain-specific item catalogs and behavioral patterns, particularly in areas
+that diverge from general world knowledge, such as online e-commerce.
+Finetuning LLMs for each domain is neither economical nor efficient.
+ In this paper, we bridge the gap between recommender models and LLMs,
+combining their respective strengths to create a versatile and interactive
+recommender system. We introduce an efficient framework called RecAgent, which
+employs LLMs as the brain and recommender models as tools. We first outline a
+minimal set of essential tools required to transform LLMs into RecAgent. We
+then propose an efficient workflow within RecAgent for task execution,
+incorporating key components such as a memory bus, dynamic
+demonstration-augmented task planning, and reflection. RecAgent enables
+traditional recommender systems, such as ID-based matrix factorization
+models, to become interactive systems with a natural language interface through
+the integration of LLMs. Experimental results on several public datasets show
+that RecAgent achieves satisfying performance as a conversational recommender
+system, outperforming general-purpose LLMs.
+
+
+
+ comment: 16 pages, 15 figures, 4 tables +
+
+
+
+
+ + ☆ AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR + Prediction + + +
+ Click-through rate (CTR) prediction is a crucial issue in recommendation
+systems. There has been an emergence of various public CTR datasets. However,
+existing datasets primarily suffer from the following limitations. Firstly,
+users generally click different types of items from multiple scenarios, and
+modeling from multiple scenarios can provide a more comprehensive understanding
+of users. Existing datasets only include data for the same type of items from a
+single scenario. Secondly, multi-modal features are essential in multi-scenario
+prediction as they address the issue of inconsistent ID encoding between
+different scenarios. The existing datasets are based on ID features and lack
+multi-modal features. Thirdly, a large-scale dataset can provide a more
+reliable evaluation of models, fully reflecting the performance differences
+between models. The scale of existing datasets is around 100 million, which is
+relatively small compared to real-world CTR prediction. To address these
+limitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset
+based on industrial data from Alipay. Specifically, AntM$^{2}$C provides the
+following advantages: 1) It covers CTR data of 5 different types of items,
+providing insights into the preferences of users for different items, including
+advertisements, vouchers, mini-programs, contents, and videos. 2) Apart from
+ID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text
+and image features, which can effectively establish connections between items
+with different IDs. 3) AntM$^{2}$C provides 1 billion CTR data with 200
+features, including 200 million users and 6 million items. It is currently the
+largest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several
+typical CTR tasks and provide comparisons with baseline methods. The dataset
+homepage is available at https://www.atecup.cn/home.
+
+
+
+
+
+
+ + ♻ ☆ Alleviating Video-Length Effect for Micro-video Recommendation + + +
+ Micro-video platforms such as TikTok are extremely popular nowadays. One
+important feature is that users no longer select videos of interest from a set;
+instead, they either watch the recommended video or skip to the next one. As a
+result, the time length of users' watching behavior becomes the most important
+signal for identifying preferences. However, our empirical data analysis has
+revealed a video-length effect: longer videos tend to receive higher average
+view time, so adopting such view-time labels for measuring user preferences can
+easily induce a biased model that favors longer videos. In this paper, we
+propose a Video Length Debiasing Recommendation (VLDRec) method to alleviate
+such an effect for micro-video recommendation. VLDRec designs the data labeling
+approach and the sample generation module that better capture user preferences
+in a view-time oriented manner. It further leverages the multi-task learning
+technique to jointly optimize the above samples with original biased ones.
+Extensive experiments show that VLDRec can improve the users' view time by
+1.81% and 11.32% on two real-world datasets, given a recommendation list of a
+fixed overall video length, compared with the best baseline method. Moreover,
+VLDRec is also more effective in matching users' interests in terms of the
+video content.
+
+
+
+ comment: Accept by TOIS +
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable
+tools for users to navigate through the vast amount of content available on
+platforms like daily news websites and book recommendation services. However,
+existing recommenders face significant challenges in understanding the content
+of items. Large language models (LLMs), which possess deep semantic
+comprehension and extensive knowledge from pretraining, have proven to be
+effective in various natural language processing tasks. In this study, we
+explore the potential of leveraging both open- and closed-source LLMs to
+enhance content-based recommendation. With open-source LLMs, we utilize their
+deep layers as content encoders, enriching the representation of content at the
+embedding level. For closed-source LLMs, we employ prompting techniques to
+enrich the training data at the token level. Through comprehensive experiments,
+we demonstrate the high effectiveness of both types of LLMs and show the
+synergistic relationship between them. Notably, we observed a significant
+relative improvement of up to 19.32% compared to existing state-of-the-art
+recommendation models. These findings highlight the immense potential of both
+open- and closed-source LLMs in enhancing content-based recommendation
+systems. We will make our code and LLM-generated data available for other
+researchers to reproduce our results.
+
+
+
+
+
+
+ + ♻ ☆ Framework to Automatically Determine the Quality of Open Data Catalogs + + +
+ Data catalogs play a crucial role in modern data-driven organizations by
+facilitating the discovery, understanding, and utilization of diverse data
+assets. However, ensuring their quality and reliability is complex, especially
+in open and large-scale data environments. This paper proposes a framework to
+automatically determine the quality of open data catalogs, addressing the need
+for efficient and reliable quality assessment mechanisms. Our framework can
+analyze various core quality dimensions (accuracy, completeness, consistency,
+scalability, and timeliness), offers several alternatives for assessing
+compatibility and similarity across such catalogs, and implements a set of
+non-core quality dimensions such as provenance, readability, and licensing.
+The goal is to empower data-driven organizations to make informed decisions
+based on trustworthy and well-curated data assets. The source code that
+illustrates our approach can be downloaded from
+https://www.github.com/jorge-martinez-gil/dataq/.
+
+
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ Unsupervised Hashing with Similarity Distribution Calibration BMVC 2023 + + +
+ Unsupervised hashing methods typically aim to preserve the similarity between
+data points in a feature space by mapping them to binary hash codes. However,
+these methods often overlook the fact that the similarity between data points
+in the continuous feature space may not be preserved in the discrete hash code
+space, due to the limited similarity range of hash codes. The similarity range
+is bounded by the code length and can lead to a problem known as similarity
+collapse. That is, the positive and negative pairs of data points become less
+distinguishable from each other in the hash space. To alleviate this problem,
+in this paper a novel Similarity Distribution Calibration (SDC) method is
+introduced. SDC aligns the hash code similarity distribution towards a
+calibration distribution (e.g., beta distribution) with sufficient spread
+across the entire similarity range, thus alleviating the similarity collapse
+problem. Extensive experiments show that our SDC significantly outperforms
+state-of-the-art alternatives on coarse category-level and instance-level image
+retrieval. Code is available at https://github.com/kamwoh/sdc.
+
+
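+ One way to realize the calibration idea as a differentiable loss, sketched
+under stated assumptions (quantile matching of sampled pair similarities
+toward a Beta target stretched over [-1, 1]; the paper's exact objective may
+differ):
+
+import torch
+
+def sdc_loss(codes, num_pairs=4096, a=2.0, b=2.0):
+    # codes: (n, L) relaxed hash codes with entries in [-1, 1].
+    n, L = codes.shape
+    i = torch.randint(n, (num_pairs,))
+    j = torch.randint(n, (num_pairs,))
+    sim = (codes[i] * codes[j]).sum(dim=1) / L            # similarities in [-1, 1]
+    target = torch.distributions.Beta(a, b).sample((num_pairs,)) * 2 - 1
+    # Sorted-value (quantile) matching pushes the empirical similarity
+    # distribution toward the well-spread calibration distribution.
+    return torch.mean((sim.sort().values - target.sort().values) ** 2)
+
+loss = sdc_loss(torch.tanh(torch.randn(512, 64)))
+print(loss.item())
+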
+
+ comment: BMVC 2023 +
+
+
+
+
+
+
+
+ + Machine Learning 124 + +
+
+
+ + ☆ A Note on Randomized Kaczmarz Algorithm for Solving Doubly-Noisy Linear + Systems + + +
+ Large-scale linear systems, $Ax=b$, frequently arise in practice and demand
+effective iterative solvers. Often, these systems are noisy due to operational
+errors or faulty data-collection processes. In the past decade, the randomized
+Kaczmarz (RK) algorithm has been studied extensively as an efficient iterative
+solver for such systems. However, existing convergence studies of RK in the
+noisy regime are limited and consider only measurement noise in the right-hand
+side vector, $b$. Unfortunately, in practice, that is not always the case; the
+coefficient matrix $A$ can also be noisy. In this paper, we analyze the
+convergence of RK for noisy linear systems when the coefficient matrix, $A$, is
+corrupted with both additive and multiplicative noise, along with the noisy
+vector, $b$. In our analyses, the quantity $\tilde R=\| \tilde A^{\dagger}
+\|_2^2 \|\tilde A \|_F^2$ influences the convergence of RK, where $\tilde A$
+represents a noisy version of $A$. We claim that our analysis is robust and
+realistically applicable, as we do not require information about the noiseless
+coefficient matrix, $A$, and considering different conditions on noise, we can
+control the convergence of RK. We substantiate our theoretical findings by
+performing comprehensive numerical experiments.
+
+
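+ A minimal NumPy sketch of the randomized Kaczmarz iteration on a
+doubly-noisy system (variable names and noise levels below are illustrative
+assumptions, not the authors' setup):
+
+import numpy as np
+
+def randomized_kaczmarz(A, b, iters=20000, seed=0):
+    # Sample row i with probability ||a_i||^2 / ||A||_F^2, then project the
+    # iterate onto the hyperplane <a_i, x> = b_i.
+    rng = np.random.default_rng(seed)
+    m, n = A.shape
+    row_norms = np.sum(A ** 2, axis=1)
+    probs = row_norms / row_norms.sum()
+    x = np.zeros(n)
+    for _ in range(iters):
+        i = rng.choice(m, p=probs)
+        x += (b[i] - A[i] @ x) / row_norms[i] * A[i]
+    return x
+
+rng = np.random.default_rng(1)
+A = rng.standard_normal((500, 50))
+x_true = rng.standard_normal(50)
+b_noisy = A @ x_true + 0.01 * rng.standard_normal(500)       # noisy b
+A_noisy = A * (1 + 0.01 * rng.standard_normal(A.shape)) \
+          + 0.01 * rng.standard_normal(A.shape)              # mult. + add. noise in A
+print(np.linalg.norm(randomized_kaczmarz(A_noisy, b_noisy) - x_true))
+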
+
+
+
+
+ + ☆ Learning to Taste: A Multimodal Wine Dataset + + +
+ We present WineSensed, a large multimodal wine dataset for studying the +relations between visual perception, language, and flavor. The dataset +encompasses 897k images of wine labels and 824k reviews of wines curated from +the Vivino platform. It has over 350k unique vintages, annotated with year, +region, rating, alcohol percentage, price, and grape composition. We obtained +fine-grained flavor annotations on a subset by conducting a wine-tasting +experiment with 256 participants who were asked to rank wines based on their +similarity in flavor, resulting in more than 5k pairwise flavor distances. We +propose a low-dimensional concept embedding algorithm that combines human +experience with automatic machine similarity kernels. We demonstrate that this +shared concept embedding space improves upon separate embedding spaces for +coarse flavor classification (alcohol percentage, country, grape, price, +rating) and aligns with the intricate human perception of flavor. + +
+
+
+
+
+ + ☆ Transformers as Support Vector Machines + + +
+ Since its inception in "Attention Is All You Need", the transformer
+architecture has led to revolutionary advancements in NLP. The attention layer
+within the transformer admits a sequence of input tokens $X$ and makes them
+interact through pairwise similarities computed as softmax$(XQK^\top X^\top)$,
+where $(K,Q)$ are the trainable key-query parameters. In this work, we
+establish a formal equivalence between the optimization geometry of
+self-attention and a hard-margin SVM problem that separates optimal input
+tokens from non-optimal tokens using linear constraints on the outer-products
+of token pairs. This formalism allows us to characterize the implicit bias of
+1-layer transformers optimized with gradient descent: (1) Optimizing the
+attention layer with vanishing regularization, parameterized by $(K,Q)$,
+converges in direction to an SVM solution minimizing the nuclear norm of the
+combined parameter $W=KQ^\top$. Instead, directly parameterizing by $W$
+minimizes a Frobenius norm objective. We characterize this convergence,
+highlighting that it can occur toward locally-optimal directions rather than
+global ones. (2) Complementing this, we prove the local/global directional
+convergence of gradient descent under suitable geometric conditions.
+Importantly, we show that over-parameterization catalyzes global convergence by
+ensuring the feasibility of the SVM problem and by guaranteeing a benign
+optimization landscape devoid of stationary points. (3) While our theory
+applies primarily to linear prediction heads, we propose a more general SVM
+equivalence that predicts the implicit bias with nonlinear heads. Our findings
+are applicable to arbitrary datasets and their validity is verified via
+experiments. We also introduce several open problems and research directions.
+We believe these findings inspire the interpretation of transformers as a
+hierarchy of SVMs that separates and selects optimal tokens.
+
+
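+ A small NumPy check of the two parameterizations discussed above: the
+$(K,Q)$ form and the combined $W=KQ^\top$ form produce the same attention map,
+while the implicit bias differs in which norm of $W$ is minimized (shapes are
+illustrative):
+
+import numpy as np
+
+def softmax(z, axis=-1):
+    z = z - z.max(axis=axis, keepdims=True)
+    e = np.exp(z)
+    return e / e.sum(axis=axis, keepdims=True)
+
+rng = np.random.default_rng(0)
+T, d = 6, 4                       # tokens, embedding dimension
+X = rng.standard_normal((T, d))
+K = rng.standard_normal((d, d))
+Q = rng.standard_normal((d, d))
+
+S_kq = softmax(X @ Q @ K.T @ X.T)     # (K, Q)-parameterized similarities
+W = K @ Q.T                           # combined parameter
+S_w = softmax(X @ W.T @ X.T)          # identical, since X Q K^T X^T = X W^T X^T
+assert np.allclose(S_kq, S_w)
+
+# The result contrasts the nuclear norm of W (reached when optimizing K, Q)
+# with its Frobenius norm (reached when optimizing W directly).
+print(np.linalg.norm(W, "nuc"), np.linalg.norm(W, "fro"))
+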
+
+
+
+
+ + ☆ PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic + Occupancy Prediction + + +
+ Semantic segmentation in autonomous driving has been undergoing an evolution +from sparse point segmentation to dense voxel segmentation, where the objective +is to predict the semantic occupancy of each voxel in the concerned 3D space. +The dense nature of the prediction space has rendered existing efficient +2D-projection-based methods (e.g., bird's eye view, range view, etc.) +ineffective, as they can only describe a subspace of the 3D scene. To address +this, we propose a cylindrical tri-perspective view to represent point clouds +effectively and comprehensively and a PointOcc model to process them +efficiently. Considering the distance distribution of LiDAR point clouds, we +construct the tri-perspective view in the cylindrical coordinate system for +more fine-grained modeling of nearer areas. We employ spatial group pooling to +maintain structural details during projection and adopt 2D backbones to +efficiently process each TPV plane. Finally, we obtain the features of each +point by aggregating its projected features on each of the processed TPV planes +without the need for any post-processing. Extensive experiments on both 3D +occupancy prediction and LiDAR segmentation benchmarks demonstrate that the +proposed PointOcc achieves state-of-the-art performance with much faster speed. +Specifically, despite only using LiDAR, PointOcc significantly outperforms all +other methods, including multi-modal methods, with a large margin on the +OpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc. + +
+
+ comment: Code is available at https://github.com/wzzheng/PointOcc +
+
+
+
+
+ + ☆ Language-Conditioned Path Planning + + +
+ Contact is at the core of robotic manipulation. At times, it is desired (e.g.
+manipulation and grasping), and at times, it is harmful (e.g. when avoiding
+obstacles). However, traditional path planning algorithms focus solely on
+collision-free paths, limiting their applicability in contact-rich tasks. To
+address this limitation, we propose the domain of Language-Conditioned Path
+Planning, where contact-awareness is incorporated into the path planning
+problem. As a first step in this domain, we propose Language-Conditioned
+Collision Functions (LACO), a novel approach that learns a collision function
+using only a single-view image, language prompt, and robot configuration. LACO
+predicts collisions between the robot and the environment, enabling flexible,
+conditional path planning without the need for manual object annotations, point
+cloud data, or ground-truth object meshes. In both simulation and the real
+world, we demonstrate that LACO can facilitate complex, nuanced path plans that
+allow for interaction with objects that are safe to collide with, rather than
+prohibiting any collision.
+
+
+
+ comment: Conference on Robot Learning, 2023 +
+
+
+
+
+ + ☆ GNFactor: Multi-Task Real Robot Learning with Generalizable Neural + Feature Fields + + +
+ It is a long-standing problem in robotics to develop agents capable of +executing diverse manipulation tasks from visual observations in unstructured +real-world environments. To achieve this goal, the robot needs to have a +comprehensive understanding of the 3D structure and semantics of the scene. In +this work, we present $\textbf{GNFactor}$, a visual behavior cloning agent for +multi-task robotic manipulation with $\textbf{G}$eneralizable $\textbf{N}$eural +feature $\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural +field (GNF) as a reconstruction module and a Perceiver Transformer as a +decision-making module, leveraging a shared deep 3D voxel representation. To +incorporate semantics in 3D, the reconstruction module utilizes a +vision-language foundation model ($\textit{e.g.}$, Stable Diffusion) to distill +rich semantic information into the deep 3D voxel. We evaluate GNFactor on 3 +real robot tasks and perform detailed ablations on 10 RLBench tasks with a +limited number of demonstrations. We observe a substantial improvement of +GNFactor over current state-of-the-art methods in seen and unseen tasks, +demonstrating the strong generalization ability of GNFactor. Our project +website is https://yanjieze.com/GNFactor/ . + +
+
+ comment: CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/ +
+
+
+
+
+ + ☆ Federated Learning in UAV-Enhanced Networks: Joint Coverage and + Convergence Time Optimization + + +
+ Federated learning (FL) involves several devices that collaboratively train a +shared model without transferring their local data. FL reduces the +communication overhead, making it a promising learning method in UAV-enhanced +wireless networks with scarce energy resources. Despite the potential, +implementing FL in UAV-enhanced networks is challenging, as conventional UAV +placement methods that maximize coverage increase the FL delay significantly. +Moreover, the uncertainty and lack of a priori information about crucial +variables, such as channel quality, exacerbate the problem. In this paper, we +first analyze the statistical characteristics of a UAV-enhanced wireless sensor +network (WSN) with energy harvesting. We then develop a model and solution +based on the multi-objective multi-armed bandit theory to maximize the network +coverage while minimizing the FL delay. Besides, we propose another solution +that is particularly useful with large action sets and strict energy +constraints at the UAVs. Our proposal uses a scalarized best-arm identification +algorithm to find the optimal arms that maximize the ratio of the expected +reward to the expected energy cost by sequentially eliminating one or more arms +in each round. Then, we derive the upper bound on the error probability of our +multi-objective and cost-aware algorithm. Numerical results show the +effectiveness of our approach. + +
+
+
+
+
+ + ☆ Prediction of Diblock Copolymer Morphology via Machine Learning + + +
+ A machine learning approach is presented to accelerate the computation of +block polymer morphology evolution for large domains over long timescales. The +strategy exploits the separation of characteristic times between coarse-grained +particle evolution on the monomer scale and slow morphological evolution over +mesoscopic scales. In contrast to empirical continuum models, the proposed +approach learns stochastically driven defect annihilation processes directly +from particle-based simulations. A UNet architecture that respects different +boundary conditions is adopted, thereby allowing periodic and fixed substrate +boundary conditions of arbitrary shape. Physical concepts are also introduced +via the loss function and symmetries are incorporated via data augmentation. +The model is validated using three different use cases. Explainable artificial +intelligence methods are applied to visualize the morphology evolution over +time. This approach enables the generation of large system sizes and long +trajectories to investigate defect densities and their evolution under +different types of confinement. As an application, we demonstrate the +importance of accessing late-stage morphologies for understanding particle +diffusion inside a single block. This work has implications for directed +self-assembly and materials design in micro-electronics, battery materials, and +membranes. + +
+
+ comment: 51 pages, 11 figures, and 5 figures in the SI
+
+
+
+
+
+ + ☆ The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 + Language Variants + + +
+ We present Belebele, a multiple-choice machine reading comprehension (MRC) +dataset spanning 122 language variants. Significantly expanding the language +coverage of natural language understanding (NLU) benchmarks, this dataset +enables the evaluation of text models in high-, medium-, and low-resource +languages. Each question is based on a short passage from the Flores-200 +dataset and has four multiple-choice answers. The questions were carefully +curated to discriminate between models with different levels of general +language comprehension. The English dataset on its own proves difficult enough +to challenge state-of-the-art language models. Being fully parallel, this +dataset enables direct comparison of model performance across all languages. We +use this dataset to evaluate the capabilities of multilingual masked language +models (MLMs) and large language models (LLMs). We present extensive results +and find that despite significant cross-lingual transfer in English-centric +LLMs, much smaller MLMs pretrained on balanced multilingual data still +understand far more languages. We also observe that larger vocabulary size and +conscious vocabulary construction correlate with better performance on +low-resource languages. Overall, Belebele opens up new avenues for evaluating +and analyzing the multilingual capabilities of NLP systems. + +
+
+ comment: 27 pages, 13 figures +
+
+
+
+
+ + ☆ Information Theoretically Optimal Sample Complexity of Learning + Dynamical Directed Acyclic Graphs + + +
+ In this article, the optimal sample complexity of learning the underlying
+interaction/dependencies of a Linear Dynamical System (LDS) over a Directed
+Acyclic Graph (DAG) is studied. The sample complexity of learning a DAG's
+structure is well-studied for static systems, where the samples of nodal states
+are independent and identically distributed (i.i.d.). However, such a study is
+less explored for DAGs with dynamical systems, where the nodal states are
+temporally correlated. We call such a DAG underlying an LDS a \emph{dynamical}
+DAG (DDAG). In particular, we consider a DDAG where the nodal dynamics are
+driven by unobserved exogenous noise sources that are wide-sense stationary
+(WSS) in time but are mutually uncorrelated, and have the same {power spectral
+density (PSD)}. Inspired by the static settings, a metric and an algorithm
+based on the PSD matrix of the observed time series are proposed to reconstruct
+the DDAG. The equal noise PSD assumption can be relaxed such that
+identifiability conditions for DDAG reconstruction are not violated. For the
+LDS with WSS (sub) Gaussian exogenous noise sources, it is shown that the
+optimal sample complexity (or length of state trajectory) needed to learn the
+DDAG is $n=\Theta(q\log(p/q))$, where $p$ is the number of nodes and $q$ is the
+maximum number of parents per node. To prove the sample complexity upper bound,
+a concentration bound for the PSD estimation is derived, under two different
+sampling strategies. A matching min-max lower bound using generalized Fano's
+inequality is also provided, thus showing the order optimality of the proposed
+algorithm.
+
+
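+ A Welch-type sketch of the PSD-matrix estimate that such a metric would be
+built on, using SciPy's cross-spectral density routine (windowing choices are
+illustrative assumptions):
+
+import numpy as np
+from scipy.signal import csd
+
+def psd_matrix(X, fs=1.0, nperseg=256):
+    # X: (p, n) multivariate time series; returns the p x p PSD matrix S(f)
+    # at each frequency via pairwise cross-spectral density estimates.
+    p, _ = X.shape
+    f, _ = csd(X[0], X[0], fs=fs, nperseg=nperseg)
+    S = np.zeros((len(f), p, p), dtype=complex)
+    for i in range(p):
+        for j in range(p):
+            _, S[:, i, j] = csd(X[i], X[j], fs=fs, nperseg=nperseg)
+    return f, S
+
+X = np.random.default_rng(0).standard_normal((4, 4096))
+f, S = psd_matrix(X)
+print(S.shape)   # (num frequencies, 4, 4)
+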
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ Majorization-Minimization for sparse SVMs + + +
+ Several decades ago, Support Vector Machines (SVMs) were introduced for
+performing binary classification tasks, under a supervised framework. Nowadays,
+they often outperform other supervised methods and remain one of the most
+popular approaches in the machine learning arena. In this work, we investigate
+the training of SVMs through a smooth sparse-promoting-regularized squared
+hinge loss minimization. This choice paves the way to the application of quick
+training methods built on majorization-minimization approaches, benefiting from
+the Lipschitz differentiability of the loss function. Moreover, the proposed
+approach allows us to handle sparsity-preserving regularizers promoting the
+selection of the most significant features, thus enhancing the performance.
+Numerical tests and comparisons conducted on three different datasets
+demonstrate the good performance of the proposed methodology in terms of
+qualitative metrics (accuracy, precision, recall, and F1 score) as well as
+computational cost.
+
+
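+ A minimal sketch of one majorization-minimization instance for a sparse
+squared-hinge SVM: the smooth loss is majorized by its Lipschitz quadratic
+upper bound at each step, and the surrogate with an l1 penalty is minimized in
+closed form by soft-thresholding. The paper studies smooth sparsity-promoting
+penalties, so l1 is used here only to keep the sketch short:
+
+import numpy as np
+
+def mm_sparse_svm(X, y, lam=0.1, iters=500):
+    n, d = X.shape
+    L = 2.0 * np.linalg.norm(X, 2) ** 2 / n      # Lipschitz constant of grad f
+    w = np.zeros(d)
+    for _ in range(iters):
+        margin = 1.0 - y * (X @ w)
+        active = margin > 0
+        # Gradient of f(w) = (1/n) sum max(0, 1 - y_i <x_i, w>)^2.
+        grad = -2.0 * X[active].T @ (y[active] * margin[active]) / n
+        z = w - grad / L                          # minimize the quadratic majorizer
+        w = np.sign(z) * np.maximum(np.abs(z) - lam / L, 0.0)   # prox of lam*l1
+    return w
+
+rng = np.random.default_rng(0)
+X = rng.standard_normal((200, 30))
+w_true = np.zeros(30); w_true[:3] = 2.0
+y = np.sign(X @ w_true)
+print(np.nonzero(mm_sparse_svm(X, y))[0])        # ideally the 3 active features
+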
+
+
+
+
+ + ☆ Natural Quantum Monte Carlo Computation of Excited States + + +
+ We present a variational Monte Carlo algorithm for estimating the lowest +excited states of a quantum system which is a natural generalization of the +estimation of ground states. The method has no free parameters and requires no +explicit orthogonalization of the different states, instead transforming the +problem of finding excited states of a given system into that of finding the +ground state of an expanded system. Expected values of arbitrary observables +can be calculated, including off-diagonal expectations between different states +such as the transition dipole moment. Although the method is entirely general, +it works particularly well in conjunction with recent work on using neural +networks as variational Ansatze for many-electron systems, and we show that by +combining this method with the FermiNet and Psiformer Ansatze we can accurately +recover vertical excitation energies and oscillator strengths on molecules as +large as benzene. Beyond the examples on molecules presented here, we expect +this technique will be of great interest for applications of variational +quantum Monte Carlo to atomic, nuclear and condensed matter physics. + +
+
+
+
+
+ + ☆ Diffusion Models for Interferometric Satellite Aperture Radar + + +
+ Probabilistic Diffusion Models (PDMs) have recently emerged as a very
+promising class of generative models, achieving high performance in natural
+image generation. However, their performance relative to non-natural images,
+like radar-based satellite data, remains largely unknown. Generating large
+amounts of synthetic (and especially labelled) satellite data is crucial to
+implement deep-learning approaches for the processing and analysis of
+(interferometric) satellite aperture radar data. Here, we leverage PDMs to
+generate several radar-based satellite image datasets. We show that PDMs
+succeed in generating images with complex and realistic structures, but that
+sampling time remains an issue. Indeed, accelerated sampling strategies, which
+work well on simple image datasets like MNIST, fail on our radar datasets. We
+provide a simple and versatile open-source codebase
+(https://github.com/thomaskerdreux/PDM_SAR_InSAR_generation) to train, sample,
+and evaluate PDMs using any dataset on a single GPU.
+
+
+
+
+
+
+ + ☆ FedDD: Toward Communication-efficient Federated Learning with + Differential Parameter Dropout + + +
+ Federated Learning (FL) requires frequent exchange of model parameters, which
+leads to long communication delays, especially when the network environments of
+clients vary greatly. Moreover, the parameter server needs to wait for the
+slowest client (i.e., the straggler, which may have the largest model size,
+lowest computing capability, or worst network condition) to upload parameters,
+which may significantly degrade the communication efficiency. Commonly-used
+client selection methods such as partial client selection would lead to the
+waste of computing resources and weaken the generalization of the global model.
+To tackle this problem, along a different line, in this paper, we advocate the
+approach of model parameter dropout instead of client selection, and
+accordingly propose a novel framework of Federated learning scheme with
+Differential parameter Dropout (FedDD). FedDD consists of two key modules:
+dropout rate allocation and uploaded parameter selection, which will optimize
+the model parameter uploading ratios tailored to different clients'
+heterogeneous conditions and also select the proper set of important model
+parameters for uploading subject to clients' dropout rate constraints.
+Specifically, the dropout rate allocation is formulated as a convex
+optimization problem, taking system heterogeneity, data heterogeneity, and
+model heterogeneity among clients into consideration. The uploaded parameter
+selection strategy prioritizes eliciting important parameters for uploading to
+speed up convergence. Furthermore, we theoretically analyze the convergence of
+the proposed FedDD scheme. Extensive performance evaluations demonstrate that
+the proposed FedDD scheme can achieve outstanding performance in both
+communication efficiency and model convergence, and also possesses a strong
+generalization capability to data of rare classes.
+
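+ As an illustrative sketch (the function and its interface are ours),
+ differential dropout can be realized by letting each client upload only the
+ largest-magnitude entries of its update at a rate matched to its conditions:
+
+ import numpy as np
+
+ def select_upload(update, dropout_rate):
+     """Return the indices and values of the (1 - dropout_rate) fraction of
+     entries with the largest magnitude, i.e., the parameters deemed most
+     important for uploading."""
+     k = max(1, int(round((1.0 - dropout_rate) * update.size)))
+     idx = np.argpartition(np.abs(update), -k)[-k:]
+     return idx, update[idx]
+
+ # e.g., a client on a poor link uploads only 20% of its update:
+ # idx, vals = select_upload(local_update, dropout_rate=0.8)
+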
+
+
+
+
+ + ☆ Latent Variable Multi-output Gaussian Processes for Hierarchical + Datasets + + +
+ Multi-output Gaussian processes (MOGPs) have been introduced to deal with
+multiple tasks by exploiting the correlations between different outputs.
+Generally, MOGP models assume a flat correlation structure between the outputs.
+However, such a formulation does not account for more elaborate relationships,
+for instance, if several replicates were observed for each output (which is a
+typical setting in biological experiments). This paper proposes an extension of
+MOGPs for hierarchical datasets (i.e. datasets for which the relationships
+between observations can be represented within a tree structure). Our model
+defines a tailored kernel function accounting for hierarchical structures in
+the data to capture different levels of correlations while leveraging the
+introduction of latent variables to express the underlying dependencies between
+outputs through a dedicated kernel. This latter feature is expected to
+significantly improve scalability as the number of tasks increases. An
+extensive experimental study involving both synthetic and real-world data from
+genomics and motion capture is presented to support our claims.
+
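+ A minimal sketch of the idea (ours, with simple group indicators standing in
+ for the paper's learned latent variables): an input kernel is modulated by
+ extra similarity for observations sharing an output and, within it, a
+ replicate.
+
+ import numpy as np
+
+ def rbf(X, ls=1.0):
+     d2 = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
+     return np.exp(-0.5 * d2 / ls ** 2)
+
+ def hierarchical_kernel(X, output_id, replicate_id, a=1.0, b=0.5):
+     """Valid covariance: RBF over inputs times (1 + a*[same output]
+     + b*[same replicate within that output]); sums and elementwise
+     products of PSD matrices stay PSD."""
+     same_out = (output_id[:, None] == output_id[None, :]).astype(float)
+     same_rep = same_out * (replicate_id[:, None] == replicate_id[None, :])
+     return rbf(X) * (1.0 + a * same_out + b * same_rep)
+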
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Irregular Traffic Time Series Forecasting Based on Asynchronous + Spatio-Temporal Graph Convolutional Network + + +
+ Accurate traffic forecasting at intersections governed by intelligent traffic
+signals is critical for the advancement of an effective intelligent traffic
+signal control system. However, due to the irregular traffic time series
+produced by intelligent intersections, the traffic forecasting task becomes
+much more intractable and imposes three major new challenges: 1) asynchronous
+spatial dependency, 2) irregular temporal dependency among traffic data, and 3)
+variable-length sequences to be predicted, which severely impede the
+performance of current traffic forecasting methods. To this end, we propose an
+Asynchronous Spatio-tEmporal graph convolutional nEtwoRk (ASeer) to predict the
+traffic states of the lanes entering intelligent intersections in a future time
+window. Specifically, by linking lanes via a traffic diffusion graph, we first
+propose an Asynchronous Graph Diffusion Network to model the asynchronous
+spatial dependency between the time-misaligned traffic state measurements of
+lanes. After that, to capture the temporal dependency within irregular traffic
+state sequences, a learnable personalized time encoding is devised to embed the
+continuous time for each lane. Then we propose a Transformable Time-aware
+Convolution Network that learns meta-filters to derive time-aware convolution
+filters with transformable filter sizes for efficient temporal convolution on
+the irregular sequence. Furthermore, a Semi-Autoregressive Prediction Network
+consisting of a state evolution unit and a semi-autoregressive predictor is
+designed to effectively and efficiently predict variable-length traffic state
+sequences. Extensive experiments on two real-world datasets demonstrate the
+effectiveness of ASeer on six metrics.
+
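+ For the time-encoding ingredient, a Time2Vec-style sinusoidal embedding of
+ continuous timestamps is one plausible realization (a sketch under that
+ assumption; the paper's exact parameterization may differ):
+
+ import numpy as np
+
+ class PersonalizedTimeEncoding:
+     """Map continuous timestamps t to sin(t * freq + phase) features;
+     freq/phase would be per-lane learnable parameters in the real model."""
+     def __init__(self, dim, seed=0):
+         rng = np.random.default_rng(seed)
+         self.freq = rng.normal(size=dim)
+         self.phase = rng.normal(size=dim)
+
+     def __call__(self, t):
+         t = np.atleast_1d(np.asarray(t, dtype=float))
+         return np.sin(t[:, None] * self.freq[None, :] + self.phase[None, :])
+
+ # enc = PersonalizedTimeEncoding(16); feats = enc([0.0, 2.5, 7.1])
+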
+
+
+
+
+ + ☆ Rank Collapse Causes Over-Smoothing and Over-Correlation in Graph Neural + Networks + + +
+ Our study reveals new theoretical insights into over-smoothing and feature +over-correlation in deep graph neural networks. We show the prevalence of +invariant subspaces, demonstrating a fixed relative behavior that is unaffected +by feature transformations. Our work clarifies recent observations related to +convergence to a constant state and a potential over-separation of node states, +as the amplification of subspaces only depends on the spectrum of the +aggregation function. In linear scenarios, this leads to node representations +being dominated by a low-dimensional subspace with an asymptotic convergence +rate independent of the feature transformations. This causes a rank collapse of +the node representations, resulting in over-smoothing when smooth vectors span +this subspace, and over-correlation even when over-smoothing is avoided. Guided +by our theory, we propose a sum of Kronecker products as a beneficial property +that can provably prevent over-smoothing, over-correlation, and rank collapse. +We empirically extend our insights to the non-linear case, demonstrating the +inability of existing models to capture linearly independent features. + +
+
+
+
+
+ + ☆ Joint Semantic-Native Communication and Inference via Minimal Simplicial + Structures + + +
+ In this work, we study the problem of semantic communication and inference,
+in which a student agent (i.e. a mobile device) queries a teacher agent (i.e. a
+cloud server) to generate higher-order data semantics living in a simplicial
+complex. Specifically, the teacher first maps its data into a k-order
+simplicial complex and learns its high-order correlations. For effective
+communication and inference, the teacher seeks minimally sufficient and
+invariant semantic structures prior to conveying information. These minimal
+simplicial structures are found via judiciously removing simplices selected by
+the Hodge Laplacians without compromising the inference query accuracy.
+Subsequently, the student locally runs its own set of queries based on a masked
+simplicial convolutional autoencoder (SCAE) leveraging both local and remote
+teacher's knowledge. Numerical results corroborate the effectiveness of the
+proposed approach in terms of improving inference query accuracy under
+different channel conditions and simplicial structures. Experiments on a
+coauthorship dataset show that removing simplices by ranking the Laplacian
+values yields an 85% reduction in payload size without sacrificing accuracy.
+Joint semantic communication and inference by masked SCAE improves query
+accuracy by 25% compared to local student-based queries and by 15% compared to
+remote teacher-based queries. Finally, incorporating channel semantics is shown
+to effectively improve inference accuracy, notably at low SNR values.
+
+
+
+
+
+ + ☆ StratMed: Relevance Stratification for Low-resource Medication + Recommendation + + +
+ With the growing imbalance between limited medical resources and escalating
+demands, AI-based clinical tasks have become paramount. Medication
+recommendation, as a sub-domain, aims to amalgamate longitudinal patient
+history with medical knowledge, assisting physicians in prescribing safer and
+more accurate medication combinations. Existing methods overlook the inherent
+long-tail distribution in medical data, lacking balanced representation between
+head and tail data, which leads to sub-optimal model performance. To address
+this challenge, we introduce StratMed, a model that incorporates an innovative
+relevance stratification mechanism. It harmonizes discrepancies arising from
+the long-tailed distribution of the data and strikes a balance between the
+safety and accuracy of medication combinations. Specifically, we first
+construct a pre-training method using deep learning networks to obtain entity
+representations. After that, we design a pyramid-like data stratification
+method to obtain more generalized entity relationships by reinforcing the
+features of unpopular entities. Based on this relationship, we design two graph
+structures to express medication precision and safety at the same level to
+obtain visit representations. Finally, the patient's historical clinical
+information is fitted to generate medication combinations for the current
+health condition. Experiments on the MIMIC-III dataset demonstrate that our
+method outperforms current state-of-the-art methods on four evaluation metrics
+(including safety and accuracy).
+
+
+
+
+
+ + ☆ Efficacy of Neural Prediction-Based NAS for Zero-Shot NAS Paradigm + + +
+ In prediction-based Neural Architecture Search (NAS), performance indicators +derived from graph convolutional networks have shown significant success. These +indicators, achieved by representing feed-forward structures as component +graphs through one-hot encoding, face a limitation: their inability to evaluate +architecture performance across varying search spaces. In contrast, handcrafted +performance indicators (zero-shot NAS), which use the same architecture with +random initialization, can generalize across multiple search spaces. Addressing +this limitation, we propose a novel approach for zero-shot NAS using deep +learning. Our method employs Fourier sum of sines encoding for convolutional +kernels, enabling the construction of a computational feed-forward graph with a +structure similar to the architecture under evaluation. These encodings are +learnable and offer a comprehensive view of the architecture's topological +information. An accompanying multi-layer perceptron (MLP) then ranks these +architectures based on their encodings. Experimental results show that our +approach surpasses previous methods using graph convolutional networks in terms +of correlation on the NAS-Bench-201 dataset and exhibits a higher convergence +rate. Moreover, our extracted feature representation trained on each +NAS-Benchmark is transferable to other NAS-Benchmarks, showing promising +generalizability across multiple search spaces. The code is available at: +https://github.com/minh1409/DFT-NPZS-NAS + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Constructing Indoor Region-based Radio Map without Location Labels + + +
+ Radio map construction requires a large amount of radio measurement data with
+location labels, which imposes a high deployment cost. This paper develops a
+region-based radio map from received signal strength (RSS) measurements without
+location labels. The construction is based on a set of blindly collected RSS
+measurement data from a device that visits each region in an indoor area
+exactly once, where the footprints and timestamps are not recorded. The main
+challenge is to cluster the RSS data and match clusters with the physical
+regions. Classical clustering algorithms fail to work as the RSS data naturally
+appears non-clustered due to multipath effects and noise. In this paper, a
+signal subspace model with a sequential prior is constructed for the RSS data,
+and an integrated segmentation and clustering algorithm is developed, which is
+shown to find the globally optimal solution in a special case. Furthermore, the
+clustered data is matched with the physical regions using a graph-based
+approach. Based on real measurements from an office space, the proposed scheme
+reduces the region localization error by roughly 50% compared to a weighted
+centroid localization (WCL) baseline, and it even outperforms some supervised
+localization schemes, including k-nearest neighbor (KNN), support vector
+machine (SVM), and deep neural network (DNN), which require labeled data for
+training.
+
+
+
+
+
+ + ☆ Training Neural Networks Using Reproducing Kernel Space Interpolation + and Model Reduction + + +
+ We introduce and study the theory of training neural networks using
+interpolation techniques from reproducing kernel Hilbert space theory. We
+generalize the method to Krein spaces, and show that widely-used neural network
+architectures are subsets of reproducing kernel Krein spaces (RKKS). We study
+the concept of "associated Hilbert spaces" of RKKS and develop techniques to
+improve upon the expressivity of various activation functions. Next, using
+concepts from the theory of functions of several complex variables, we prove a
+computationally applicable, multidimensional generalization of the celebrated
+Adamjan-Arov-Krein (AAK) theorem. The theorem yields a novel class of neural
+networks, called Prolongation Neural Networks (PNN). We demonstrate that, by
+applying the multidimensional AAK theorem to obtain a PNN, one can achieve
+performance superior to both our interpolatory methods and current
+state-of-the-art methods in noisy environments. We provide useful illustrations
+of our methods in practice.
+
+
+
+
+
+ + ☆ Moreau Envelope ADMM for Decentralized Weakly Convex Optimization + + +
+ This paper proposes a proximal variant of the alternating direction method of
+multipliers (ADMM) for distributed optimization. Although the current versions
+of the ADMM algorithm provide promising numerical results in producing
+solutions that are close to optimal for many convex and non-convex optimization
+problems, it remains unclear if they can converge to a stationary point for
+weakly convex and locally non-smooth functions. Through our analysis using the
+Moreau envelope function, we demonstrate that the proposed Moreau envelope ADMM
+(MADM) can indeed converge to a stationary point under mild conditions. Our
+analysis also includes computing the bounds on the amount of change in the dual
+variable update step by relating the gradient of the Moreau envelope function
+to the proximal function. Furthermore, the results of our numerical experiments
+indicate that our method is faster and more robust than widely-used approaches.
+
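+ As background on the central tool (standard facts, not the paper's
+ algorithm): the Moreau envelope smooths a non-smooth function and is evaluated
+ via the proximal operator, illustrated here in one dimension for f = |.|.
+
+ import numpy as np
+
+ def prox_abs(x, lam):
+     """prox_{lam*|.|}(x): soft-thresholding."""
+     return np.sign(x) * np.maximum(np.abs(x) - lam, 0.0)
+
+ def moreau_abs(x, lam):
+     """e_lam(x) = min_y |y| + (y - x)^2 / (2*lam), the Huber-like,
+     differentiable envelope of |.|; minimized at y = prox_abs(x, lam)."""
+     y = prox_abs(x, lam)
+     return np.abs(y) + (y - x) ** 2 / (2.0 * lam)
+
+ # gradient of the envelope: (x - prox_abs(x, lam)) / lam, defined everywhere
+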
+
+
+
+
+ + ☆ US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for + Cervical Lymph Node Lesions Diagnoses in Ultrasound Images + + +
+ Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph
+node lesions. However, the diagnoses of these images largely hinge on the
+expertise of medical practitioners, rendering the process susceptible to
+misdiagnoses. Although rapidly developing deep learning has substantially
+improved the diagnoses of diverse ultrasound images, there remains a
+conspicuous research gap concerning cervical lymph nodes. The objective of our
+work is to accurately diagnose cervical lymph node lesions by leveraging a deep
+learning model. To this end, we first collected 3392 images containing normal
+lymph nodes, benign lymph node lesions, malignant primary lymph node lesions,
+and malignant metastatic lymph node lesions. Given that ultrasound images are
+generated by the reflection and scattering of sound waves across varied bodily
+tissues, we propose the Conv-FFT Block. It integrates convolutional operations
+with the fast Fourier transform to more astutely model the images. Building
+upon this foundation, we designed a novel architecture, named US-SFNet. This
+architecture not only discerns variances in ultrasound images from the spatial
+domain but also adeptly captures microstructural alterations across various
+lesions in the frequency domain. To ascertain the potential of US-SFNet, we
+benchmarked it against 12 popular architectures through five-fold
+cross-validation. The results show that US-SFNet achieves state-of-the-art
+performance: 92.89% accuracy, 90.46% precision, 89.95% sensitivity, and 97.49%
+specificity.
+
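+ One plausible reading of such a block (our sketch; the paper's exact Conv-FFT
+ Block may differ): a spatial convolution branch summed with a branch that
+ filters the feature map in the frequency domain.
+
+ import torch
+ import torch.nn as nn
+
+ class ConvFFTBlock(nn.Module):
+     def __init__(self, ch):
+         super().__init__()
+         self.spatial = nn.Conv2d(ch, ch, kernel_size=3, padding=1)
+         self.freq = nn.Conv2d(2 * ch, 2 * ch, kernel_size=1)  # on re/im parts
+
+     def forward(self, x):
+         s = self.spatial(x)                      # spatial-domain branch
+         z = torch.fft.fft2(x, norm="ortho")      # frequency-domain branch
+         z = self.freq(torch.cat([z.real, z.imag], dim=1))
+         re, im = torch.chunk(z, 2, dim=1)
+         f = torch.fft.ifft2(torch.complex(re, im), norm="ortho").real
+         return s + f
+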
+
+
+
+
+ + ☆ Robust Networked Federated Learning for Localization + + +
+ This paper addresses the problem of localization, which is inherently +non-convex and non-smooth in a federated setting where the data is distributed +across a multitude of devices. Due to the decentralized nature of federated +environments, distributed learning becomes essential for scalability and +adaptability. Moreover, these environments are often plagued by outlier data, +which presents substantial challenges to conventional methods, particularly in +maintaining estimation accuracy and ensuring algorithm convergence. To mitigate +these challenges, we propose a method that adopts an $L_1$-norm robust +formulation within a distributed sub-gradient framework, explicitly designed to +handle these obstacles. Our approach addresses the problem in its original +form, without resorting to iterative simplifications or approximations, +resulting in enhanced computational efficiency and improved estimation +accuracy. We demonstrate that our method converges to a stationary point, +highlighting its effectiveness and reliability. Through numerical simulations, +we confirm the superior performance of our approach, notably in outlier-rich +environments, which surpasses existing state-of-the-art localization methods. + +
+
+
+
+
+ + ☆ Robust Representation Learning for Unreliable Partial Label Learning + + +
+ Partial Label Learning (PLL) is a type of weakly supervised learning where
+each training instance is assigned a set of candidate labels, but only one
+label is the ground-truth. However, this idealistic assumption may not always
+hold due to potential annotation inaccuracies, meaning the ground-truth may not
+be present in the candidate label set. This is known as Unreliable Partial
+Label Learning (UPLL), which introduces additional complexity due to the
+inherent unreliability and ambiguity of partial labels, often resulting in
+sub-optimal performance with existing methods. To address this challenge, we
+propose the Unreliability-Robust Representation Learning framework (URRL) that
+leverages unreliability-robust contrastive learning to fortify the model
+against unreliable partial labels effectively. Concurrently, we propose a dual
+strategy that combines KNN-based candidate label set correction and
+consistency-regularization-based label disambiguation to refine label quality
+and enhance the ability of representation learning within the URRL framework.
+Extensive experiments demonstrate that the proposed method outperforms
+state-of-the-art PLL methods on various datasets with diverse degrees of
+unreliability and ambiguity. Furthermore, we provide a theoretical analysis of
+our approach from the perspective of the expectation maximization (EM)
+algorithm. Upon acceptance, we pledge to make the code publicly accessible.
+
+
+
+
+
+ + ☆ Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor + Attack + + +
+ The vulnerabilities to backdoor attacks have recently threatened the
+trustworthiness of machine learning models in practical applications.
+Conventional wisdom suggests that not everyone can be an attacker since the
+process of designing the trigger generation algorithm often involves
+significant effort and extensive experimentation to ensure the attack's
+stealthiness and effectiveness. Alternatively, this paper shows that there
+exists a more severe backdoor threat: anyone can exploit an easily-accessible
+algorithm for silent backdoor attacks. Specifically, an attacker can employ
+the widely-used lossy image compression from a plethora of compression tools to
+effortlessly inject a trigger pattern into an image without leaving any
+noticeable trace; i.e., the generated triggers are natural artifacts. One does
+not require extensive knowledge to click on the "convert" or "save as" button
+while using tools for lossy image compression. Via this attack, the adversary
+does not need to design a trigger generator as seen in prior works and only
+requires poisoning the data. Empirically, the proposed attack consistently
+achieves a 100% attack success rate on several benchmark datasets such as
+MNIST, CIFAR-10, GTSRB, and CelebA. More significantly, the proposed attack can
+still achieve an almost 100% attack success rate with very small (approximately
+10%) poisoning rates in the clean-label setting. The generated trigger of the
+proposed attack using one lossy compression algorithm is also transferable
+across other related compression algorithms, exacerbating the severity of this
+backdoor threat. This work takes another crucial step toward understanding the
+extensive risks of backdoor attacks in practice, urging practitioners to
+investigate similar attacks and relevant backdoor mitigation methods.
+
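+ The poisoning step is simple enough to express in a few lines; a sketch
+ (ours) using Pillow's JPEG encoder as the easily-accessible compression tool:
+
+ import io
+ import numpy as np
+ from PIL import Image
+
+ def compression_trigger(img, quality=10):
+     """Round-trip a uint8 image array through aggressive lossy JPEG
+     compression; the resulting artifacts serve as the natural trigger."""
+     buf = io.BytesIO()
+     Image.fromarray(img).save(buf, format="JPEG", quality=quality)
+     buf.seek(0)
+     return np.asarray(Image.open(buf))
+
+ # poisoned = compression_trigger(clean_image)  # then pair with the target label
+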
+
+ comment: 14 pages. This paper shows everyone can mount a powerful and stealthy + backdoor attack with the widely-used lossy image compression +
+
+
+
+
+ + ☆ Everything, Everywhere All in One Evaluation: Using Multiverse Analysis + to Evaluate the Influence of Model Design Decisions on Algorithmic Fairness + + +
+ A vast number of systems across the world use algorithmic decision making +(ADM) to (partially) automate decisions that have previously been made by +humans. When designed well, these systems promise more objective decisions +while saving large amounts of resources and freeing up human time. However, +when ADM systems are not designed well, they can lead to unfair decisions which +discriminate against societal groups. The downstream effects of ADMs critically +depend on the decisions made during the systems' design and implementation, as +biases in data can be mitigated or reinforced along the modeling pipeline. Many +of these design decisions are made implicitly, without knowing exactly how they +will influence the final system. It is therefore important to make explicit the +decisions made during the design of ADM systems and understand how these +decisions affect the fairness of the resulting system. + To study this issue, we draw on insights from the field of psychology and +introduce the method of multiverse analysis for algorithmic fairness. In our +proposed method, we turn implicit design decisions into explicit ones and +demonstrate their fairness implications. By combining decisions, we create a +grid of all possible "universes" of decision combinations. For each of these +universes, we compute metrics of fairness and performance. Using the resulting +dataset, one can see how and which decisions impact fairness. We demonstrate +how multiverse analyses can be used to better understand variability and +robustness of algorithmic fairness using an exemplary case study of predicting +public health coverage of vulnerable populations for potential interventions. +Our results illustrate how decisions during the design of a machine learning +system can have surprising effects on its fairness and how to detect these +effects using multiverse analysis. + +
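+ Operationally, the grid of universes is the Cartesian product of the design
+ decisions; a minimal sketch with hypothetical decisions:
+
+ import itertools
+
+ decisions = {                        # hypothetical design decisions
+     "imputation": ["mean", "drop_rows"],
+     "protected_features": ["included", "excluded"],
+     "threshold": [0.5, "group_specific"],
+     "model": ["logreg", "gbm"],
+ }
+
+ universes = [dict(zip(decisions, combo))
+              for combo in itertools.product(*decisions.values())]
+
+ for u in universes:
+     # fit the pipeline under decisions u, then record fairness and
+     # performance metrics for this universe
+     pass
+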
+
+
+
+
+ + ☆ Branches of a Tree: Taking Derivatives of Programs with Discrete and + Branching Randomness in High Energy Physics + + +
+ We propose to apply several gradient estimation techniques to enable the +differentiation of programs with discrete randomness in High Energy Physics. +Such programs are common in High Energy Physics due to the presence of +branching processes and clustering-based analysis. Thus differentiating such +programs can open the way for gradient based optimization in the context of +detector design optimization, simulator tuning, or data analysis and +reconstruction optimization. We discuss several possible gradient estimation +strategies, including the recent Stochastic AD method, and compare them in +simplified detector design experiments. In doing so we develop, to the best of +our knowledge, the first fully differentiable branching program. + +
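+ As background for the estimation strategies compared, the classical
+ score-function (REINFORCE) estimator differentiates through discrete
+ randomness; a toy sketch for a single Bernoulli branch (the Stochastic AD
+ method discussed in the paper is a different, typically lower-variance
+ alternative):
+
+ import numpy as np
+
+ def score_function_grad(f, theta, n=100_000, seed=0):
+     """Estimate d/dtheta E_{x ~ Bernoulli(theta)}[f(x)] via
+     E[f(x) * d log p(x; theta) / d theta]."""
+     rng = np.random.default_rng(seed)
+     x = (rng.random(n) < theta).astype(float)
+     score = x / theta - (1.0 - x) / (1.0 - theta)
+     return float(np.mean(f(x) * score))
+
+ # d/dtheta E[3x] = 3 exactly; the estimate should be close:
+ # score_function_grad(lambda x: 3.0 * x, theta=0.3)
+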
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Dynamic nsNet2: Efficient Deep Noise Suppression with Early Exiting MLSP 2023 + + +
+ Although deep learning has made strides in the field of deep noise
+suppression, leveraging deep architectures on resource-constrained devices
+still proves challenging. Therefore, we present an early-exiting model based on
+nsNet2 that provides several levels of accuracy and resource savings by halting
+computations at different stages. Moreover, we adapt the original architecture
+by splitting the information flow to take into account the injected dynamism.
+We show the trade-offs between performance and computational complexity based
+on established metrics.
+
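+ Schematically, early exiting attaches an output head to each stage and halts
+ once a confidence proxy clears a threshold; the sketch below is generic
+ (ours), not the adapted nsNet2 architecture, and its confidence proxy is a
+ placeholder.
+
+ import torch
+ import torch.nn as nn
+
+ class EarlyExitStack(nn.Module):
+     def __init__(self, dim, n_stages=3):
+         super().__init__()
+         self.stages = nn.ModuleList(
+             [nn.Sequential(nn.Linear(dim, dim), nn.ReLU()) for _ in range(n_stages)])
+         self.heads = nn.ModuleList([nn.Linear(dim, dim) for _ in range(n_stages)])
+
+     def forward(self, x, threshold=0.9):
+         h, out = x, None
+         for stage, head in zip(self.stages, self.heads):
+             h = stage(h)
+             out = head(h)                        # e.g., a noise-suppression mask
+             if torch.sigmoid(out).mean() > threshold:  # placeholder confidence
+                 break                            # skip the remaining stages
+         return out
+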
+
+ comment: Accepted at the MLSP 2023 +
+
+
+
+
+ + ☆ Communication-Efficient Decentralized Federated Learning via One-Bit + Compressive Sensing + + +
+ Decentralized federated learning (DFL) has gained popularity due to its
+practicality across various applications. Compared to the centralized version,
+training a shared model among a large number of nodes in DFL is more
+challenging, as there is no central server to coordinate the training process.
+Especially when distributed nodes suffer from limitations in communication or
+computational resources, DFL will experience extremely inefficient and unstable
+training. Motivated by these challenges, in this paper, we develop a novel
+algorithm based on the framework of the inexact alternating direction method
+(iADM). On one hand, our goal is to train a shared model with a sparsity
+constraint. This constraint enables us to leverage one-bit compressive sensing
+(1BCS), allowing transmission of one-bit information among neighbour nodes. On
+the other hand, communication between neighbour nodes occurs only at certain
+steps, reducing the number of communication rounds. Therefore, the algorithm
+exhibits notable communication efficiency. Additionally, as each node selects
+only a subset of neighbours to participate in the training, the algorithm is
+robust against stragglers. Moreover, complex items are computed only once for
+several consecutive steps and subproblems are solved inexactly using
+closed-form solutions, resulting in high computational efficiency. Finally,
+numerical experiments showcase the algorithm's effectiveness in both
+communication and computation.
+
+
+
+
+
+ + ☆ What can we learn from quantum convolutional neural networks? + + +
+ We can learn from analyzing quantum convolutional neural networks (QCNNs) +that: 1) working with quantum data can be perceived as embedding physical +system parameters through a hidden feature map; 2) their high performance for +quantum phase recognition can be attributed to generation of a very suitable +basis set during the ground state embedding, where quantum criticality of spin +models leads to basis functions with rapidly changing features; 3) pooling +layers of QCNNs are responsible for picking those basis functions that can +contribute to forming a high-performing decision boundary, and the learning +process corresponds to adapting the measurement such that few-qubit operators +are mapped to full-register observables; 4) generalization of QCNN models +strongly depends on the embedding type, and that rotation-based feature maps +with the Fourier basis require careful feature engineering; 5) accuracy and +generalization of QCNNs with readout based on a limited number of shots favor +the ground state embeddings and associated physics-informed models. We +demonstrate these points in simulation, where our results shed light on +classification for physical processes, relevant for applications in sensing. +Finally, we show that QCNNs with properly chosen ground state embeddings can be +used for fluid dynamics problems, expressing shock wave solutions with good +generalization and proven trainability. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Autoencoder-based Online Data Quality Monitoring for the CMS + Electromagnetic Calorimeter + + +
+ The online Data Quality Monitoring system (DQM) of the CMS electromagnetic
+calorimeter (ECAL) is a crucial operational tool that allows ECAL experts to
+quickly identify, localize, and diagnose a broad range of detector issues that
+would otherwise hinder physics-quality data taking. Although the existing ECAL
+DQM system has been continuously updated to respond to new problems, it remains
+one step behind newer and unforeseen issues. Using unsupervised deep learning,
+a real-time autoencoder-based anomaly detection system is developed that is
+able to detect ECAL anomalies unseen in past data. After accounting for spatial
+variations in the response of the ECAL and the temporal evolution of anomalies,
+the new system is able to efficiently detect anomalies while maintaining an
+estimated false discovery rate between $10^{-2}$ and $10^{-4}$, beating
+existing benchmarks by about two orders of magnitude. The real-world
+performance of the system is validated using anomalies found in 2018 and 2022
+LHC collision data. Additionally, first results from deploying the
+autoencoder-based system in the CMS online DQM workflow for the ECAL barrel
+during Run 3 of the LHC are presented, showing its promising performance in
+detecting obscure issues that could have been missed in the existing DQM
+system.
+
+
+ comment: Submitted to the Proceedings of 21st International Workshop on + Advanced Computing and Analysis Techniques in Physics Research ACAT 2022 + conference +
+
+
+
+
+ + ☆ Generate Your Own Scotland: Satellite Image Generation Conditioned on + Maps + + +
+ Despite recent advancements in image generation, diffusion models still
+remain largely underexplored in Earth Observation. In this paper we show that
+state-of-the-art pretrained diffusion models can be conditioned on cartographic
+data to generate realistic satellite images. We provide two large datasets of
+paired OpenStreetMap images and satellite views over the region of Mainland
+Scotland and the Central Belt. We train a ControlNet model and qualitatively
+evaluate the results, demonstrating that both image quality and map fidelity
+are achievable. Finally, we provide some insights on the opportunities and
+challenges of applying these models for remote sensing. Our model weights and
+code for creating the dataset are publicly available at
+https://github.com/miquel-espinosa/map-sat.
+
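+ With the Hugging Face diffusers library, the inference side of such a setup
+ looks roughly as follows (a sketch; the checkpoint names are illustrative
+ stand-ins, and the paper's trained weights are released via the repository
+ above):
+
+ import torch
+ from PIL import Image
+ from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
+
+ controlnet = ControlNetModel.from_pretrained(
+     "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)  # illustrative
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
+     "runwayml/stable-diffusion-v1-5", controlnet=controlnet,
+     torch_dtype=torch.float16).to("cuda")
+
+ osm_tile = Image.open("osm_tile.png")  # rendered OpenStreetMap conditioning image
+ out = pipe("an aerial satellite photograph of Scotland", image=osm_tile,
+            num_inference_steps=30).images[0]
+ out.save("generated_satellite.png")
+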
+
+ comment: 13 pages, 6 figures. preprint +
+
+
+
+
+ + ☆ Towards Long-Tailed Recognition for Graph Classification via + Collaborative Experts + + +
+ Graph classification, which aims at learning graph-level representations for
+effective class assignment, has achieved outstanding results, which heavily
+rely on high-quality datasets with balanced class distributions. In fact, most
+real-world graph data naturally presents a long-tailed form, where the head
+classes occupy many more samples than the tail classes; it is thus essential to
+study graph-level classification over long-tailed data, yet this setting
+remains largely unexplored. However, most existing long-tailed learning methods
+in vision fail to jointly optimize the representation learning and classifier
+training, and neglect the mining of the hard-to-classify classes. Directly
+applying existing methods to graphs may lead to sub-optimal performance, since
+the model trained on graphs would be more sensitive to the long-tailed
+distribution due to the complex topological characteristics. Hence, in this
+paper, we propose a novel long-tailed graph-level classification framework via
+Collaborative Multi-expert Learning (CoMe) to tackle the problem. To
+equilibrate the contributions of head and tail classes, we first develop
+balanced contrastive learning from the view of representation learning, and
+then design an individual-expert classifier training based on hard class
+mining. In addition, we execute gated fusion and disentangled knowledge
+distillation among the multiple experts to promote collaboration in the
+multi-expert framework. Comprehensive experiments are performed on seven
+widely-used benchmark datasets to demonstrate the superiority of our method
+CoMe over state-of-the-art baselines.
+
+
+ comment: Accepted by IEEE Transactions on Big Data (TBD 2024) +
+
+
+
+
+ + ☆ A Causal Discovery Approach To Learn How Urban Form Shapes Sustainable + Mobility Across Continents + + +
+ Global sustainability requires low-carbon urban transport systems, shaped by
+adequate infrastructure, deployment of low-carbon transport modes, and shifts
+in travel behavior. To adequately implement alterations in infrastructure, it
+is essential to grasp the location-specific cause-and-effect mechanisms that
+the constructed environment has on travel. Yet, current research falls short in
+representing causal relationships between the 6D urban form variables and
+travel, generalizing across different regions, and modeling urban form effects
+at high spatial resolution. Here, we address all three gaps by utilizing a
+causal discovery and an explainable machine learning framework to detect urban
+form effects on intra-city travel based on high-resolution mobility data of six
+cities across three continents. We show that distance to the city center,
+demographics, and density indirectly affect other urban form features. By
+considering the causal relationships, we find that location-specific influences
+align across cities, yet vary in magnitude. In addition, the spread of the city
+and the coverage of jobs across the city are the strongest determinants of
+travel-related emissions, highlighting the benefits of compact development.
+Differences in urban form effects across the cities call for a more holistic
+definition of 6D measures. Our work is a starting point for location-specific
+analysis of urban form effects on mobility behavior using causal discovery
+approaches, which is highly relevant for city planners and municipalities
+across continents.
+
+
+ comment: 22 pages, 13 figures, 4 tables +
+
+
+
+
+ + ☆ Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation + + +
+ Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential
+role in the early diagnosis and treatment of liver cancer. Deep learning models
+backboned by fully convolutional neural networks (FCNNs) have become the
+dominant model for segmenting 3D computerized tomography (CT) scans. However,
+since their convolution layers suffer from limited kernel size, they are not
+able to capture long-range dependencies and global context. To tackle this
+restriction, vision transformers have been introduced to solve FCNNs' locality
+of receptive fields. Although transformers can capture long-range features,
+their segmentation performance decreases with various tumor sizes due to the
+model's sensitivity to the input patch size. While finding an optimal patch
+size improves the performance of vision transformer-based models on
+segmentation tasks, it is a time-consuming and challenging procedure. This
+paper proposes a technique to select the vision transformer's optimal input
+multi-resolution image patch size based on the average volume size of
+metastasis lesions. We further validated our suggested framework using a
+transfer-learning technique, demonstrating that the highest Dice similarity
+coefficient (DSC) performance was obtained by pre-training on training data
+with a larger tumor volume using the suggested ideal patch size and then
+training with a smaller one. We experimentally evaluate this idea through
+pre-training our model on a multi-resolution public dataset. Our model showed
+consistent and improved results when applied to our private multi-resolution
+mCRC dataset with a smaller average tumor volume. This study lays the
+groundwork for optimizing semantic segmentation of small objects using vision
+transformers. The implementation source code is available at:
+https://github.com/Ramtin-Mojtahedi/OVTPS.
+
+
+
+
+
+ + ☆ Towards Spontaneous Style Modeling with Semi-supervised Pre-training for + Conversational Text-to-Speech Synthesis INTERSPEECH 2023 + + +
+ The spontaneous behavior that often occurs in conversations makes speech more
+human-like compared to reading-style speech. However, synthesizing
+spontaneous-style speech is challenging due to the lack of high-quality
+spontaneous datasets and the high cost of labeling spontaneous behavior. In
+this paper, we propose a semi-supervised pre-training method to increase the
+amount of spontaneous-style speech and spontaneous behavior labels. In the
+process of semi-supervised learning, both text and speech information are
+considered for detecting spontaneous behavior labels in speech. Moreover, a
+linguistic-aware encoder is used to model the relationship between sentences in
+the conversation. Experimental results indicate that our proposed method
+achieves superior expressive speech synthesis performance with the ability to
+model spontaneous behavior in spontaneous-style speech and predict reasonable
+spontaneous behavior from text.
+
+
+ comment: Accepted by INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Development and validation of an interpretable machine learning-based + calculator for predicting 5-year weight trajectories after bariatric surgery: + a multinational retrospective cohort SOPHIA study + + +
+ Background: Weight loss trajectories after bariatric surgery vary widely
+between individuals, and predicting weight loss before the operation remains
+challenging. We aimed to develop a model using machine learning to provide
+individual preoperative prediction of 5-year weight loss trajectories after
+surgery.
+ Methods: In this multinational retrospective observational study we enrolled
+adult participants (aged $\ge$18 years) from ten prospective cohorts (including
+ABOS [NCT01129297], BAREVAL [NCT02310178], the Swedish Obese Subjects study,
+and a large cohort from the Dutch Obesity Clinic [Nederlandse Obesitas
+Kliniek]) and two randomised trials (SleevePass [NCT00793143] and SM-BOSS
+[NCT00356213]) in Europe, the Americas, and Asia, with a 5-year follow-up after
+Roux-en-Y gastric bypass, sleeve gastrectomy, or gastric band. Patients with a
+previous history of bariatric surgery or large delays between scheduled and
+actual visits were excluded. The training cohort comprised patients from two
+centres in France (ABOS and BAREVAL). The primary outcome was BMI at 5 years. A
+model was developed using least absolute shrinkage and selection operator to
+select variables and the classification and regression trees algorithm to build
+interpretable regression trees. The performance of the model was assessed
+through the median absolute deviation (MAD) and root mean squared error (RMSE)
+of BMI.
+ Findings: 10,231 patients from 12 centres in ten countries were included in
+the analysis, corresponding to 30,602 patient-years. Among participants in all
+12 cohorts, 7701 (75.3%) were female and 2530 (24.7%) were male. Among 434
+baseline attributes available in the training cohort, seven variables were
+selected: height, weight, intervention type, age, diabetes status, diabetes
+duration, and smoking status. At 5 years, across external testing cohorts, the
+overall mean MAD of BMI was 2.8 kg/m$^2$ (95% CI 2.6-3.0) and the mean RMSE of
+BMI was 4.7 kg/m$^2$ (4.4-5.0), and the mean difference between predicted and
+observed BMI was -0.3 kg/m$^2$ (SD 4.7). This model is incorporated in an
+easy-to-use and interpretable web-based prediction tool to help inform clinical
+decisions before surgery.
+ Interpretation: We developed a machine learning-based model, which is
+internationally validated, for predicting individual 5-year weight loss
+trajectories after three common bariatric interventions.
+
+
+ comment: The Lancet Digital Health, 2023 +
+
+
+
+
+ + ☆ CL-MAE: Curriculum-Learned Masked Autoencoders + + +
+ Masked image modeling has been demonstrated as a powerful pretext task for
+generating robust representations that can be effectively generalized across
+multiple downstream tasks. Typically, this approach involves randomly masking
+patches (tokens) in input images, with the masking strategy remaining unchanged
+during training. In this paper, we propose a curriculum learning approach that
+updates the masking strategy to continually increase the complexity of the
+self-supervised reconstruction task. We conjecture that, by gradually
+increasing the task complexity, the model can learn more sophisticated and
+transferable representations. To facilitate this, we introduce a novel
+learnable masking module that possesses the capability to generate masks of
+different complexities, and integrate the proposed module into masked
+autoencoders (MAE). Our module is jointly trained with the MAE, while adjusting
+its behavior during training, transitioning from a partner of the MAE
+(optimizing the same reconstruction loss) to an adversary (optimizing the
+opposite loss), passing through a neutral state. The transition between these
+behaviors is smooth, being regulated by a factor that is multiplied with the
+reconstruction loss of the masking module. The resulting training procedure
+generates an easy-to-hard curriculum. We train our Curriculum-Learned Masked
+Autoencoder (CL-MAE) on ImageNet and show that it exhibits superior
+representation learning capabilities compared to MAE. The empirical results on
+five downstream tasks confirm our conjecture, demonstrating that curriculum
+learning can be successfully used to self-supervise masked autoencoders.
+
+
+
+
+
+ + ☆ Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based + Approach + + +
+ In the rapidly evolving digital era, the analysis of document layouts plays a
+pivotal role in automated information extraction and interpretation. In our
+work, we have trained the MViTv2 transformer model architecture with cascaded
+Mask R-CNN on the BaDLAD dataset to extract text boxes, paragraphs, images, and
+tables from a document. After training on 20365 document images for 36 epochs
+in a 3-phase cycle, we achieved a training loss of 0.2125 and a mask loss of
+0.19. Our work extends beyond training, delving into the exploration of
+potential enhancement avenues. We investigate the impact of rotation and flip
+augmentation, the effectiveness of slicing input images pre-inference, the
+implications of varying the resolution of the transformer backbone, and the
+potential of employing a dual-pass inference to uncover missed text boxes.
+Through these explorations, we observe a spectrum of outcomes, where some
+modifications result in tangible performance improvements, while others offer
+unique insights for future endeavors.
+
+
+
+
+
+ + ☆ MONDEO: Multistage Botnet Detection + + +
+ Mobile devices have become widespread and are now among the most used pieces
+of technology. Due to their characteristics, they have become major targets for
+botnet-related malware. FluBot is one example of botnet malware that infects
+mobile devices. In particular, FluBot is a DNS-based botnet that uses Domain
+Generation Algorithms (DGA) to establish communication with the Command and
+Control Server (C2). MONDEO is a multistage mechanism with a flexible design to
+detect DNS-based botnet malware. MONDEO is lightweight and can be deployed
+without requiring the deployment of software, agents, or configuration in
+mobile devices, allowing easy integration in core networks. MONDEO comprises
+four detection stages: Blacklisting/Whitelisting, Query rate analysis, DGA
+analysis, and Machine learning evaluation. It was created with the goal of
+processing streams of packets to identify attacks with high efficiency across
+its distinct phases. MONDEO was tested against several datasets to measure its
+efficiency and performance, being able to achieve high performance with
+RandomForest classifiers. The implementation is available on GitHub.
+
+
+
+
+
+ + ☆ Forecasting Emergency Department Crowding with Advanced Machine Learning + Models and Multivariable Input + + +
+ Emergency department (ED) crowding is a significant threat to patient safety
+and it has been repeatedly associated with increased mortality. Forecasting
+future service demand has the potential to improve patient outcomes. Despite
+active research on the subject, several gaps remain: 1) proposed forecasting
+models have become outdated due to the quick influx of advanced machine
+learning (ML) models, 2) the amount of multivariable input data has been
+limited, and 3) discrete performance metrics have rarely been reported. In this
+study, we document the performance of a set of advanced ML models in
+forecasting ED occupancy 24 hours ahead. We use electronic health record data
+from a large, combined ED with an extensive set of explanatory variables,
+including the availability of beds in catchment area hospitals, traffic data
+from local observation stations, weather variables, etc. We show that N-BEATS
+and LightGBM outperform benchmarks, with respective improvements of 11% and 9%,
+and that DeepAR predicts next-day crowding with an AUC of 0.76 (95% CI
+0.69-0.84). To the best of our knowledge, this is the first study to document
+the superiority of LightGBM and N-BEATS over statistical benchmarks in the
+context of ED forecasting.
+
+
+
+
+
+ + ☆ Scalable Incomplete Multi-View Clustering with Structure Alignment + + +
+ The success of existing multi-view clustering (MVC) relies on the assumption
+that all views are complete. However, samples are usually only partially
+available due to data corruption or sensor malfunction, which motivates the
+research of incomplete multi-view clustering (IMVC). Although several
+anchor-based IMVC methods have been proposed to process large-scale incomplete
+data, they still suffer from the following drawbacks: i) Most existing
+approaches neglect the inter-view discrepancy and enforce cross-view
+representations to be consistent, which would corrupt the representation
+capability of the model; ii) Due to the sample disparity between different
+views, the learned anchors might be misaligned, which we refer to as the
+Anchor-Unaligned Problem for Incomplete data (AUP-ID). The AUP-ID causes
+inaccurate graph fusion and degrades clustering performance. To tackle these
+issues, we propose a novel incomplete anchor graph learning framework termed
+Scalable Incomplete Multi-View Clustering with Structure Alignment (SIMVC-SA).
+Specifically, we construct the view-specific anchor graph to capture the
+complementary information from different views. In order to solve the AUP-ID,
+we propose a novel structure alignment module to refine the cross-view anchor
+correspondence. Meanwhile, the anchor graph construction and alignment are
+jointly optimized in our unified framework to enhance clustering quality.
+Through anchor graph construction instead of full graphs, the time and space
+complexity of the proposed SIMVC-SA is proven to be linearly correlated with
+the number of samples. Extensive experiments on seven incomplete benchmark
+datasets demonstrate the effectiveness and efficiency of our proposed method.
+Our code is publicly available at https://github.com/wy1019/SIMVC-SA.
+
+
+
+
+
+ + ☆ On a Connection between Differential Games, Optimal Control, and + Energy-based Models for Multi-Agent Interactions ICML 2023 + + +
+ Game theory offers an interpretable mathematical framework for modeling +multi-agent interactions. However, its applicability in real-world robotics +applications is hindered by several challenges, such as unknown agents' +preferences and goals. To address these challenges, we show a connection +between differential games, optimal control, and energy-based models and +demonstrate how existing approaches can be unified under our proposed +Energy-based Potential Game formulation. Building upon this formulation, this +work introduces a new end-to-end learning application that combines neural +networks for game-parameter inference with a differentiable game-theoretic +optimization layer, acting as an inductive bias. The experiments using +simulated mobile robot pedestrian interactions and real-world automated driving +data provide empirical evidence that the game-theoretic layer improves the +predictive performance of various neural network backbones. + +
+
+ comment: International Conference on Machine Learning, Workshop on New + Frontiers in Learning, Control, and Dynamical Systems (ICML 2023 + Frontiers4LCD) +
+
+
+
+
+ + ☆ Conditioning Score-Based Generative Models by Neuro-Symbolic Constraints + + +
+ Score-based and diffusion models have emerged as effective approaches for
+both conditional and unconditional generation. Still, conditional generation is
+based either on the specific training of a conditional model or on classifier
+guidance, which requires training a noise-dependent classifier, even when a
+classifier for uncorrupted data is given. We propose an approach to sample from
+unconditional score-based generative models enforcing arbitrary logical
+constraints, without any additional training. Firstly, we show how to
+manipulate the learned score in order to sample from an un-normalized
+distribution conditional on a user-defined constraint. Then, we define a
+flexible and numerically stable neuro-symbolic framework for encoding soft
+logical constraints. Combining these two ingredients, we obtain a general, but
+approximate, conditional sampling algorithm. We further develop effective
+heuristics aimed at improving the approximation. Finally, we show the
+effectiveness of our approach for various types of constraints and data:
+tabular data, images, and time series.
+
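+ The score manipulation amounts to adding the gradient of a soft-constraint
+ log-density to the learned unconditional score during sampling; a toy
+ unadjusted-Langevin sketch (ours, omitting noise schedules):
+
+ import numpy as np
+
+ def constrained_langevin(score, constraint_grad, x0, steps=500, eps=1e-2,
+                          lam=1.0, seed=0):
+     """Langevin sampling with the unconditional score augmented by
+     lam * grad log p(constraint | x), steering samples toward the
+     constraint without retraining the model."""
+     rng = np.random.default_rng(seed)
+     x = np.array(x0, dtype=float)
+     for _ in range(steps):
+         g = score(x) + lam * constraint_grad(x)
+         x = x + 0.5 * eps * g + np.sqrt(eps) * rng.normal(size=x.shape)
+     return x
+
+ # toy: standard-normal base score, soft constraint pushing x[0] above 2
+ # x = constrained_langevin(lambda x: -x,
+ #                          lambda x: np.array([float(x[0] < 2.0), 0.0]),
+ #                          x0=np.zeros(2))
+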
+
+
+
+
+ + ☆ SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded + Objects + + +
+ To enable meaningful robotic manipulation of objects in the real-world, 6D +pose estimation is one of the critical aspects. Most existing approaches have +difficulties to extend predictions to scenarios where novel object instances +are continuously introduced, especially with heavy occlusions. In this work, we +propose a few-shot pose estimation (FSPE) approach called SA6D, which uses a +self-adaptive segmentation module to identify the novel target object and +construct a point cloud model of the target object using only a small number of +cluttered reference images. Unlike existing methods, SA6D does not require +object-centric reference images or any additional object information, making it +a more generalizable and scalable solution across categories. We evaluate SA6D +on real-world tabletop object datasets and demonstrate that SA6D outperforms +existing FSPE methods, particularly in cluttered scenes with occlusions, while +requiring fewer reference images. + +
+
+
+
+
+ + ☆ Curvature-based Pooling within Graph Neural Networks ECML + + +
+ Over-squashing and over-smoothing are two critical issues that limit the
+capabilities of graph neural networks (GNNs). While over-smoothing eliminates
+the differences between nodes, making them indistinguishable, over-squashing
+refers to the inability of GNNs to propagate information over long distances,
+as exponentially many node states are squashed into fixed-size representations.
+Both phenomena share similar causes, as both are largely induced by the graph
+topology. To mitigate these problems in graph classification tasks, we propose
+CurvPool, a novel pooling method. CurvPool exploits the notion of curvature of
+a graph to adaptively identify structures responsible for both over-smoothing
+and over-squashing. By clustering nodes based on the Balanced Forman curvature,
+CurvPool constructs a graph with a more suitable structure, allowing deeper
+models and the combination of distant information. We compare it to other
+state-of-the-art pooling approaches and establish its competitiveness in terms
+of classification accuracy, computational complexity, and flexibility. CurvPool
+outperforms several comparable methods across all considered tasks. The most
+consistent results are achieved by pooling densely connected clusters using the
+sum aggregation, as this preserves additional information about the size of
+each pool.
+
+
+ comment: ECMLPKDD 2023 - Workshop on Mining and Learning with Graphs +
+
+
+
+
+ + ☆ In-class Data Analysis Replications: Teaching Students while Testing + Science + + +
+ Science is facing a reproducibility crisis. Previous work has proposed +incorporating data analysis replications into classrooms as a potential +solution. However, despite the potential benefits, it is unclear whether this +approach is feasible, and if so, what the involved stakeholders-students, +educators, and scientists-should expect from it. Can students perform a data +analysis replication over the course of a class? What are the costs and +benefits for educators? And how can this solution help benchmark and improve +the state of science? + In the present study, we incorporated data analysis replications in the +project component of the Applied Data Analysis course (CS-401) taught at EPFL +(N=354 students). Here we report pre-registered findings based on surveys +administered throughout the course. First, we demonstrate that students can +replicate previously published scientific papers, most of them qualitatively +and some exactly. We find discrepancies between what students expect of data +analysis replications and what they experience by doing them along with changes +in expectations about reproducibility, which together serve as evidence of +attitude shifts to foster students' critical thinking. Second, we provide +information for educators about how much overhead is needed to incorporate +replications into the classroom and identify concerns that replications bring +as compared to more traditional assignments. Third, we identify tangible +benefits of the in-class data analysis replications for scientific communities, +such as a collection of replication reports and insights about replication +barriers in scientific work that should be avoided going forward. + Overall, we demonstrate that incorporating replication tasks into a large +data science class can increase the reproducibility of scientific work as a +by-product of data science instruction, thus benefiting both science and +students. + +
+
+
+
+
+ + ☆ Latent Painter + + +
+ Latent diffusers have revolutionized generative AI and inspired creative art.
+When denoising the latent, the sequence of predicted original images across
+steps collectively animates the formation. However, the animation is limited by
+the denoising nature of the diffuser, and only renders a sharpening process.
+This work presents Latent Painter, which uses the latent as the canvas, and the
+diffuser predictions as the plan, to generate painting animation. Latent
+Painter also transitions one generated image into another, which can happen
+between images from two different sets of checkpoints.
+
+
+
+
+
+ + ☆ Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning + + +
+ Affordable 3D scanners often produce sparse and non-uniform point clouds that
+negatively impact downstream applications in robotic systems. While existing
+point cloud upsampling architectures have demonstrated promising results on
+standard benchmarks, they tend to experience significant performance drops when
+the test data have different distributions from the training data. To address
+this issue, this paper proposes a test-time adaptation approach to enhance the
+generality of point cloud upsampling models. The proposed approach leverages
+meta-learning to explicitly learn network parameters for test-time adaptation.
+Our method does not require any prior information about the test data. During
+meta-training, the model parameters are learned from a collection of
+instance-level tasks, each of which consists of a sparse-dense pair of point
+clouds from the training data. During meta-testing, the trained model is
+fine-tuned with a few gradient updates to produce a unique set of network
+parameters for each test instance. The updated model is then used for the final
+prediction. Our framework is generic and can be applied in a plug-and-play
+manner with existing backbone networks in point cloud upsampling. Extensive
+experiments demonstrate that our approach improves the performance of
+state-of-the-art models.
+
+
+
+
+
+ + ☆ Echocardiographic View Classification with Integrated + Out-of-Distribution Detection for Enhanced Automatic Echocardiographic + Analysis + + +
+ In the rapidly evolving field of automatic echocardiographic analysis and +interpretation, automatic view classification is a critical yet challenging +task, owing to the inherent complexity and variability of echocardiographic +data. This study presents ECHOcardiography VIew Classification with +Out-of-Distribution dEtection (ECHO-VICODE), a novel deep learning-based +framework that effectively addresses this challenge by training to classify 31 +classes, surpassing previous studies and demonstrating its capacity to handle a +wide range of echocardiographic views. Furthermore, ECHO-VICODE incorporates an +integrated out-of-distribution (OOD) detection function, leveraging the +relative Mahalanobis distance to effectively identify 'near-OOD' instances +commonly encountered in echocardiographic data. Through extensive +experimentation, we demonstrated the outstanding performance of ECHO-VICODE in +terms of view classification and OOD detection, significantly reducing the +potential for errors in echocardiographic analyses. This pioneering study +significantly advances the domain of automated echocardiography analysis and +exhibits promising prospects for substantial applications in extensive clinical +research and practice. + +
+
+
+
+
+ + ☆ Point-TTA: Test-Time Adaptation for Point Cloud Registration Using + Multitask Meta-Auxiliary Learning + + +
+ We present Point-TTA, a novel test-time adaptation framework for point cloud +registration (PCR) that improves the generalization and the performance of +registration models. While learning-based approaches have achieved impressive +progress, generalization to unknown testing environments remains a major +challenge due to the variations in 3D scans. Existing methods typically train a +generic model and the same trained model is applied on each instance during +testing. This could be sub-optimal since it is difficult for the same model to +handle all the variations during testing. In this paper, we propose a test-time +adaptation approach for PCR. Our model can adapt to unseen distributions at +test-time without requiring any prior knowledge of the test data. Concretely, +we design three self-supervised auxiliary tasks that are optimized jointly with +the primary PCR task. Given a test instance, we adapt our model using these +auxiliary tasks and the updated model is used to perform the inference. During +training, our model is trained using a meta-auxiliary learning approach, such +that the adapted model via auxiliary tasks improves the accuracy of the primary +task. Experimental results demonstrate the effectiveness of our approach in +improving generalization of point cloud registration and outperforming other +state-of-the-art approaches. + +
+
+
+
+
+ + ☆ A Policy Adaptation Method for Implicit Multitask Reinforcement Learning + Problems + + +
+ In dynamic motion generation tasks, including contact and collisions, small
+changes in policy parameters can lead to extremely different returns. For
+example, in soccer, the ball can fly in completely different directions
+following a similar heading motion if the hitting position, the force applied
+to the ball, or the friction of the ball changes slightly. However, it is
+difficult to imagine that completely different skills are needed for heading a
+ball in different directions. In this study, we propose a multitask
+reinforcement learning algorithm for adapting a policy to implicit changes in
+goals or environments in a single motion category with different reward
+functions or physical parameters of the environment. We evaluated the proposed
+method on the ball heading task using a monopod robot model. The results showed
+that the proposed method can adapt to implicit changes in the goal positions or
+the coefficients of restitution of the ball, whereas the standard domain
+randomization approach cannot cope with different task settings.
+
+
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ Domain-adaptive Message Passing Graph Neural Network + + +
+ Cross-network node classification (CNNC), which aims to classify nodes in a
+label-deficient target network by transferring the knowledge from a source
+network with abundant labels, has drawn increasing attention recently. To
+address CNNC, we propose a domain-adaptive message passing graph neural network
+(DM-GNN), which integrates graph neural network (GNN) with conditional
+adversarial domain adaptation. DM-GNN is capable of learning informative
+representations for node classification that are also transferable across
+networks. Firstly, a GNN encoder is constructed by dual feature extractors to
+separate ego-embedding learning from neighbor-embedding learning so as to
+jointly capture commonality and discrimination between connected nodes.
+Secondly, a label propagation node classifier is proposed to refine each node's
+label prediction by combining its own prediction and its neighbors' prediction.
+In addition, a label-aware propagation scheme is devised for the labeled source
+network to promote intra-class propagation while avoiding inter-class
+propagation, thus yielding label-discriminative source embeddings. Thirdly,
+conditional adversarial domain adaptation is performed to take the
+neighborhood-refined class-label information into account during adversarial
+domain adaptation, so that the class-conditional distributions across networks
+can be better matched. Comparisons with eleven state-of-the-art methods
+demonstrate the effectiveness of the proposed DM-GNN.
+
+
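+ The label propagation classifier can be sketched as mixing each node's own
+prediction with its neighbours' average prediction; the mixing weight, number
+of rounds, and normalised adjacency below are assumptions, not values from the
+paper:
+```python
+import torch
+
+def label_propagation(probs, adj_norm, alpha=0.5, steps=2):
+    # probs: (N, C) per-node class probabilities;
+    # adj_norm: (N, N) row-normalised adjacency matrix.
+    for _ in range(steps):
+        probs = alpha * probs + (1.0 - alpha) * adj_norm @ probs
+    return probs
+```
+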
+
+
+
+
+ + ☆ Computing excited states of molecules using normalizing flows + + +
+ We present a new nonlinear variational framework for simultaneously computing
+ground and excited states of quantum systems. Our approach is based on
+approximating wavefunctions in the linear span of basis functions that are
+augmented and optimized \emph{via} composition with normalizing flows. The
+accuracy and efficiency of our approach are demonstrated in the calculations of
+a large number of vibrational states of the triatomic H$_2$S molecule as well
+as ground and several excited electronic states of prototypical one-electron
+systems including the hydrogen atom, the molecular hydrogen ion, and a carbon
+atom in a single-active-electron approximation. The results demonstrate
+significant improvements in the accuracy of energy predictions and accelerated
+basis-set convergence even when using normalizing flows with a small number of
+parameters. The present approach can also be seen as the optimization of a set
+of intrinsic coordinates that best capture the underlying physics within the
+given basis set.
+
+
+
+
+
+
+ + ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained language models like ChatGPT have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks. Moreover, in bioinformatics, generating +functional programs poses additional notable challenges due to the amount of +domain knowledge, the need for complicated data operations, and intricate +functional dependencies between the operations. Here, we present BioCoder, a +benchmark developed to evaluate existing pre-trained models in generating +bioinformatics code. In relation to function-code generation, BioCoder covers +potential package dependencies, class declarations, and global variables. It +incorporates 1026 functions and 1243 methods in Python and Java from GitHub and +253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing +framework for evaluation, and we have applied it to evaluate many models +including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, +InstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes +the importance of domain knowledge, pragmatic code generation, and contextual +understanding. Our dataset, benchmark, Docker images, and scripts required for +testing are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ☆ Least Squares Maximum and Weighted Generalization-Memorization Machines + + +
+ In this paper, we propose a new way of remembering by introducing a memory
+influence mechanism for the least squares support vector machine (LSSVM).
+Without changing the equation constraints of the original LSSVM, this
+mechanism allows an accurate partitioning of the training set without
+overfitting. The maximum memory impact model (MIMM) and the weighted impact
+memory model (WIMM) are then proposed. It is demonstrated that these models can
+be degraded to the LSSVM. Furthermore, we propose some different memory impact
+functions for the MIMM and WIMM. The experimental results show that our
+MIMM and WIMM have better generalization performance compared to the LSSVM and
+a significant advantage in time cost compared to other memory models.
+
+
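+ For context, a hedged sketch of the baseline LSSVM dual system that MIMM and
+WIMM can be degraded to; the RBF kernel and the regularisation parameter are
+standard choices rather than specifics from the paper:
+```python
+import numpy as np
+
+def rbf(A, B, sigma=1.0):
+    sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
+    return np.exp(-sq / (2.0 * sigma ** 2))
+
+def lssvm_fit(X, y, gamma=1.0):
+    # Solve the LSSVM dual system [[0, 1^T], [1, K + I/gamma]] [b; a] = [0; y].
+    n = len(y)
+    A = np.zeros((n + 1, n + 1))
+    A[0, 1:] = 1.0
+    A[1:, 0] = 1.0
+    A[1:, 1:] = rbf(X, X) + np.eye(n) / gamma
+    sol = np.linalg.solve(A, np.concatenate(([0.0], y)))
+    return sol[0], sol[1:]                      # bias b, dual coefficients
+
+def lssvm_predict(X_train, alpha, b, X_new):
+    return np.sign(rbf(X_new, X_train) @ alpha + b)
+```
+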
+
+
+
+
+ + ☆ Adversarial Finetuning with Latent Representation Constraint to Mitigate + Accuracy-Robustness Tradeoff ICCV + + +
+ This paper addresses the tradeoff between standard accuracy on clean examples +and robustness against adversarial examples in deep neural networks (DNNs). +Although adversarial training (AT) improves robustness, it degrades the +standard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we +propose a novel AT method called ARREST, which comprises three components: (i) +adversarial finetuning (AFT), (ii) representation-guided knowledge distillation +(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples +by initializing its parameters with a DNN that is standardly pretrained on +clean examples. RGKD and NR respectively entail a regularization term and an +algorithm to preserve latent representations of clean examples during AFT. RGKD +penalizes the distance between the representations of the standardly pretrained +and AFT DNNs. NR switches input adversarial examples to nonadversarial ones +when the representation changes significantly during AFT. By combining these +components, ARREST achieves both high standard accuracy and robustness. +Experimental results demonstrate that ARREST mitigates the tradeoff more +effectively than previous AT-based methods do. + +
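+ A hedged sketch of the RGKD idea: penalise the distance between latent
+representations of the finetuned DNN and the frozen, standardly pretrained
+DNN. The choice of MSE as the distance and the weighting lam are assumptions:
+```python
+import torch
+import torch.nn.functional as F
+
+def arrest_style_loss(model, pretrained, x_adv, y, lam=1.0):
+    # Both networks are assumed to return (logits, latent_representation).
+    logits, feats = model(x_adv)
+    with torch.no_grad():
+        _, ref_feats = pretrained(x_adv)
+    # RGKD term: keep the finetuned latents close to the pretrained ones.
+    return F.cross_entropy(logits, y) + lam * F.mse_loss(feats, ref_feats)
+```
+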
+
+ comment: Accepted by International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Listen to Minority: Encrypted Traffic Classification for Class Imbalance + with Contrastive Pre-Training + + +
+ Mobile Internet has profoundly reshaped modern lifestyles in various aspects.
+Encrypted Traffic Classification (ETC) naturally plays a crucial role in
+managing mobile Internet, especially with the explosive growth of mobile apps
+using encrypted communication. Despite some existing learning-based ETC methods
+showing promising results, three limitations remain in real-world network
+environments: 1) label bias caused by traffic class imbalance, 2) traffic
+homogeneity caused by component sharing, and 3) training with reliance on
+sufficient labeled traffic. None of the existing ETC methods can address all
+these limitations. In this paper, we propose a novel Pre-trAining
+Semi-Supervised ETC framework, dubbed PASS. Our key insight is to resample the
+original train dataset and perform contrastive pre-training without using
+individual app labels directly to avoid label bias issues caused by class
+imbalance, while obtaining a robust feature representation to differentiate
+overlapping homogeneous traffic by pulling positive traffic pairs closer and
+pushing negative pairs away. Meanwhile, PASS designs a semi-supervised
+optimization strategy based on pseudo-label iteration and dynamic loss
+weighting algorithms in order to effectively utilize massive unlabeled traffic
+data and alleviate the manual annotation workload for the train dataset. PASS
+outperforms state-of-the-art ETC methods and generic sampling approaches on
+four public datasets with significant class imbalance and traffic homogeneity,
+remarkably pushing the F1 score on Cross-Platform215 by 1.31% and on ISCX-17 by
+9.12%. Furthermore, we validate the generality of the contrastive pre-training
+and pseudo-label iteration components of PASS, which can adaptively benefit ETC
+methods with diverse feature extractors.
+
+
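+ The "pull positives closer, push negatives away" objective matches a standard
+InfoNCE contrastive loss; this sketch assumes two augmented views per flow and
+is not necessarily PASS's exact formulation:
+```python
+import torch
+import torch.nn.functional as F
+
+def info_nce(z1, z2, tau=0.07):
+    # z1, z2: (B, D) embeddings of two views of the same B traffic flows;
+    # diagonal entries of the similarity matrix are the positive pairs.
+    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
+    logits = z1 @ z2.t() / tau
+    labels = torch.arange(z1.size(0), device=z1.device)
+    return F.cross_entropy(logits, labels)
+```
+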
+
+ comment: Accepted by 2023 IEEE SECON, 9 pages, 6 figures +
+
+
+
+
+ + ☆ AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR + Prediction + + +
+ Click-through rate (CTR) prediction is a crucial issue in recommendation
+systems. There has been an emergence of various public CTR datasets. However,
+existing datasets primarily suffer from the following limitations. Firstly,
+users generally click different types of items from multiple scenarios, and
+modeling from multiple scenarios can provide a more comprehensive understanding
+of users; existing datasets only include data for the same type of items from a
+single scenario. Secondly, multi-modal features are essential in multi-scenario
+prediction as they address the issue of inconsistent ID encoding between
+different scenarios; the existing datasets are based on ID features and lack
+multi-modal features. Thirdly, a large-scale dataset can provide a more
+reliable evaluation of models, fully reflecting the performance differences
+between models; the scale of existing datasets is around 100 million, which is
+relatively small compared to real-world CTR prediction. To address these
+limitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset
+based on industrial data from Alipay. Specifically, AntM$^{2}$C provides the
+following advantages: 1) It covers CTR data of 5 different types of items,
+providing insights into the preferences of users for different items, including
+advertisements, vouchers, mini-programs, contents, and videos. 2) Apart from
+ID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text
+and image features, which can effectively establish connections between items
+with different IDs. 3) AntM$^{2}$C provides 1 billion CTR data with 200
+features, including 200 million users and 6 million items. It is currently the
+largest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several
+typical CTR tasks and provide comparisons with baseline methods. The dataset
+homepage is available at https://www.atecup.cn/home.
+
+
+
+
+
+
+ + ☆ On the Equivalence between Implicit and Explicit Neural Networks: A + High-dimensional Viewpoint ICML + 2023 + + +
+ Implicit neural networks have demonstrated remarkable success in various +tasks. However, there is a lack of theoretical analysis of the connections and +differences between implicit and explicit networks. In this paper, we study +high-dimensional implicit neural networks and provide the high dimensional +equivalents for the corresponding conjugate kernels and neural tangent kernels. +Built upon this, we establish the equivalence between implicit and explicit +networks in high dimensions. + +
+
+ comment: Accepted by Workshop on High-dimensional Learning Dynamics, ICML + 2023, Honolulu, Hawaii +
+
+
+
+
+ + ☆ DECODE: DilatEd COnvolutional neural network for Detecting + Extreme-mass-ratio inspirals + + +
+ The detection of Extreme Mass Ratio Inspirals (EMRIs) is intricate due to
+their complex waveforms, extended duration, and low signal-to-noise ratio
+(SNR), making them more challenging to identify than compact binary
+coalescences. While matched filtering-based techniques are known for their
+computational demands, existing deep learning-based methods primarily handle
+time-domain data and are often constrained by data duration and SNR. In
+addition, most existing work ignores time-delay interferometry (TDI) and
+applies the long-wavelength approximation in detector response calculations,
+thus limiting their ability to handle laser frequency noise. In this study, we
+introduce DECODE, an end-to-end model focusing on EMRI signal detection by
+sequence modeling in the frequency domain. Centered around a dilated causal
+convolutional neural network, trained on synthetic data considering TDI-1.5
+detector response, DECODE can efficiently process a year's worth of
+multichannel TDI data with an SNR of around 50. We evaluate our model on 1-year
+data with accumulated SNR ranging from 50 to 120 and achieve a true positive
+rate of 96.3% at a false positive rate of 1%, keeping an inference time of less
+than 0.01 seconds. With the visualization of three showcased EMRI signals for
+interpretability and generalization, DECODE exhibits strong potential for
+future space-based gravitational wave data analyses.
+
+
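+ The backbone idea, a stack of causal 1D convolutions with exponentially
+growing dilation, can be sketched as follows (channel width, depth, and
+activation are assumptions, not DECODE's configuration):
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class DilatedCausalStack(nn.Module):
+    def __init__(self, channels=32, layers=6, kernel_size=2):
+        super().__init__()
+        self.convs = nn.ModuleList(
+            nn.Conv1d(channels, channels, kernel_size, dilation=2 ** i)
+            for i in range(layers))
+        self.pads = [(kernel_size - 1) * 2 ** i for i in range(layers)]
+
+    def forward(self, x):                      # x: (batch, channels, steps)
+        for conv, pad in zip(self.convs, self.pads):
+            x = torch.relu(conv(F.pad(x, (pad, 0))))   # left pad => causal
+        return x
+```
+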
+
+
+
+
+ + ☆ CktGNN: Circuit Graph Neural Network for Electronic Design Automation ICLR + + +
+ The electronic design automation of analog circuits has been a longstanding
+challenge in the integrated circuit field due to the huge design space and
+complex design trade-offs among circuit specifications. In the past decades,
+intensive research efforts have mostly been devoted to automating transistor
+sizing with a given circuit topology. By recognizing the graph nature of
+circuits, this paper presents a Circuit Graph Neural Network (CktGNN) that
+simultaneously automates the circuit topology generation and device sizing
+based on the encoder-dependent optimization subroutines. Particularly, CktGNN
+encodes circuit graphs using a two-level GNN framework (of nested GNN) where
+circuits are represented as combinations of subgraphs in a known subgraph
+basis. In this way, it significantly improves design efficiency by reducing the
+number of subgraphs to perform message passing. Meanwhile, another critical
+roadblock to advancing learning-assisted circuit design automation is a lack of
+public benchmarks to perform canonical assessment and reproducible research. To
+tackle the challenge, we introduce Open Circuit Benchmark (OCB), an
+open-sourced dataset that contains $10$K distinct operational amplifiers with
+carefully-extracted circuit specifications. OCB is also equipped with
+communicative circuit generation and evaluation capabilities such that it can
+help to generalize CktGNN to design various analog circuits by producing
+corresponding datasets. Experiments on OCB show the extraordinary advantages of
+CktGNN through representation-based optimization frameworks over other recent
+powerful GNN baselines and human experts' manual designs. Our work paves the
+way toward a learning-based open-sourced design automation for analog circuits.
+Our source code is available at \url{https://github.com/zehao-dong/CktGNN}.
+
+
+
+ comment: Accepted by ICLR (International Conference on Learning + Representations) 2023 +
+
+
+
+
+ + ☆ Balancing between the Local and Global Structures (LGS) in Graph + Embedding + + +
+ We present a method for balancing between the Local and Global Structures
+(LGS) in graph embedding, via a tunable parameter. Some embedding methods aim
+to capture global structures, while others attempt to preserve local
+neighborhoods. Few methods attempt to do both, and it is not always possible to
+capture both local and global information well in two dimensions, which is
+where most graph drawings live. The choice of using a local or a global
+embedding for visualization depends not only on the task but also on the
+structure of the underlying data, which may not be known in advance. For a
+given graph, LGS aims to find a good balance between the local and global
+structure to preserve. We evaluate the performance of LGS with synthetic and
+real-world datasets and our results indicate that it is competitive with the
+state-of-the-art methods, using established quality metrics such as stress and
+neighborhood preservation. We introduce a novel quality metric, cluster
+distance preservation, to assess intermediate structure capture. All
+source-code, datasets, experiments and analysis are available online.
+
+
+
+ comment: Appears in the Proceedings of the 31st International Symposium on + Graph Drawing and Network Visualization (GD 2023) +
+
+
+
+
+ + ☆ Improving Robustness and Accuracy of Ponzi Scheme Detection on Ethereum + Using Time-Dependent Features + + +
+ The rapid development of blockchain has led to more and more funding pouring
+into the cryptocurrency market, which has also attracted cybercriminals'
+interest in recent years. The Ponzi scheme, an old-fashioned fraud, is now
+popular on the blockchain, causing considerable financial losses to many
+crypto-investors. A few Ponzi detection methods have been proposed in the
+literature, most of which detect a Ponzi scheme based on its smart contract
+source code or opcode. The contract-code-based approach, while achieving very
+high accuracy, is not robust: first, the source codes of a majority of
+contracts on Ethereum are not available, and second, a Ponzi developer can fool
+a contract-code-based detection model by obfuscating the opcode or inventing a
+new profit distribution logic that cannot be detected (since these models were
+trained on existing Ponzi logics only). A transaction-based approach could
+improve the robustness of detection because transactions, unlike smart
+contracts, are harder to manipulate. However, the current transaction-based
+detection models achieve fairly low accuracy. We address this gap in the
+literature by developing new detection models that rely only on the
+transactions, hence guaranteeing the robustness, and moreover, achieve
+considerably higher Accuracy, Precision, Recall, and F1-score than existing
+transaction-based models. This is made possible thanks to the introduction of
+novel time-dependent features that capture characteristic Ponzi behaviours,
+derived from our comprehensive data analyses on Ponzi and non-Ponzi data from
+the XBlock-ETH repository.
+
+
+
+ comment: 17 pages, 9 figures, 4 tables +
+
+
+
+
+ + ☆ BenchTemp: A General Benchmark for Evaluating Temporal Graph Neural + Networks + + +
+ To handle graphs in which features or connectivities are evolving over time,
+a series of temporal graph neural networks (TGNNs) have been proposed. Despite
+the success of these TGNNs, the previous TGNN evaluations reveal several
+limitations regarding four critical issues: 1) inconsistent datasets, 2)
+inconsistent evaluation pipelines, 3) lacking workload diversity, and 4)
+lacking efficient comparison. Overall, an empirical study that puts TGNN models
+on the same footing and compares them comprehensively has been lacking. To this
+end, we propose BenchTemp, a general benchmark for evaluating TGNN models on
+various workloads. BenchTemp provides a set of benchmark datasets so that
+different TGNN models can be fairly compared. Further, BenchTemp engineers a
+standard pipeline that unifies the TGNN evaluation. With BenchTemp, we
+extensively compare the representative TGNN models on different tasks (e.g.,
+link prediction and node classification) and settings (transductive and
+inductive), w.r.t. both effectiveness and efficiency metrics. We have made
+BenchTemp publicly available at https://github.com/qianghuangwhu/benchtemp.
+
+
+
+ comment: 28 pages, 23 figures, 27 tables. Submitted to the Conference on + Neural Information Processing Systems 2023 Track on Datasets and Benchmarks +
+
+
+
+
+ + ☆ Multi-Objective Decision Transformers for Offline Reinforcement Learning + + +
+ Offline Reinforcement Learning (RL) is structured to derive policies from
+static trajectory data without requiring real-time environment interactions.
+Recent studies have shown the feasibility of framing offline RL as a sequence
+modeling task, where the sole aim is to predict actions based on prior context
+using the transformer architecture. However, the limitation of this single task
+learning approach is its potential to undermine the transformer model's
+attention mechanism, which should ideally allocate varying attention weights
+across different tokens in the input context for optimal prediction. To address
+this, we reformulate offline RL as a multi-objective optimization problem,
+where the prediction is extended to states and returns. We also highlight a
+potential flaw in the trajectory representation used for sequence modeling,
+which could generate inaccuracies when modeling the state and return
+distributions. This is due to the non-smoothness of the action distribution
+within the trajectory dictated by the behavioral policy. To mitigate this
+issue, we introduce action space regions to the trajectory representation. Our
+experiments on D4RL benchmark locomotion tasks reveal that our propositions
+allow for more effective utilization of the attention mechanism in the
+transformer model, resulting in performance that either matches or outperforms
+current state-of-the-art methods.
+
+
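+ The multi-objective reformulation amounts to adding state and return
+prediction heads alongside the usual action head; a hedged sketch with assumed
+loss weights (the paper's exact objective may differ):
+```python
+import torch.nn.functional as F
+
+def multi_objective_loss(pred, target, w=(1.0, 0.5, 0.5)):
+    # pred/target: dicts holding "actions", "states" and "returns" tensors.
+    return (w[0] * F.mse_loss(pred["actions"], target["actions"])
+            + w[1] * F.mse_loss(pred["states"], target["states"])
+            + w[2] * F.mse_loss(pred["returns"], target["returns"]))
+```
+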
+
+
+
+
+ + ☆ A Survey on Privacy in Graph Neural Networks: Attacks, Preservation, and + Applications + + +
+ Graph Neural Networks (GNNs) have gained significant attention owing to their +ability to handle graph-structured data and the improvement in practical +applications. However, many of these models prioritize high utility +performance, such as accuracy, with a lack of privacy consideration, which is a +major concern in modern society where privacy attacks are rampant. To address +this issue, researchers have started to develop privacy-preserving GNNs. +Despite this progress, there is a lack of a comprehensive overview of the +attacks and the techniques for preserving privacy in the graph domain. In this +survey, we aim to address this gap by summarizing the attacks on graph data +according to the targeted information, categorizing the privacy preservation +techniques in GNNs, and reviewing the datasets and applications that could be +used for analyzing/solving privacy issues in GNNs. We also outline potential +directions for future research in order to build better privacy-preserving +GNNs. + +
+
+
+
+
+ + ☆ SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked + Prefills + + +
+ Large Language Model (LLM) inference consists of two distinct phases - +prefill phase which processes the input prompt and decode phase which generates +output tokens autoregressively. While the prefill phase effectively saturates +GPU compute at small batch sizes, the decode phase results in low compute +utilization as it generates one token at a time per request. The varying +prefill and decode times also lead to imbalance across micro-batches when using +pipeline parallelism, resulting in further inefficiency due to bubbles. + We present SARATHI to address these challenges. SARATHI employs +chunked-prefills, which splits a prefill request into equal sized chunks, and +decode-maximal batching, which constructs a batch using a single prefill chunk +and populates the remaining slots with decodes. During inference, the prefill +chunk saturates GPU compute, while the decode requests 'piggyback' and cost up +to an order of magnitude less compared to a decode-only batch. Chunked-prefills +allows constructing multiple decode-maximal batches from a single prefill +request, maximizing coverage of decodes that can piggyback. Furthermore, the +uniform compute design of these batches ameliorates the imbalance between +micro-batches, significantly reducing pipeline bubbles. + Our techniques yield significant improvements in inference performance across +models and hardware. For the LLaMA-13B model on A6000 GPU, SARATHI improves +decode throughput by up to 10x, and accelerates end-to-end throughput by up to +1.33x. For LLaMa-33B on A100 GPU, we achieve 1.25x higher end-to-end-throughput +and up to 4.25x higher decode throughput. When used with pipeline parallelism +on GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end +throughput improvement of 1.91x. + +
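+ A schematic of chunked-prefills with decode-maximal batching; the slot budget
+and scheduling policy here are illustrative, not SARATHI's implementation:
+```python
+def decode_maximal_batches(prefill_tokens, chunk_size, active_decodes, slots):
+    # Split the prompt into equal-sized chunks; each batch carries one chunk
+    # plus up to (slots - 1) piggybacked single-token decode requests.
+    chunks = [(lo, min(lo + chunk_size, prefill_tokens))
+              for lo in range(0, prefill_tokens, chunk_size)]
+    return [{"prefill_chunk": span, "decodes": active_decodes[:slots - 1]}
+            for span in chunks]
+
+# A 1024-token prompt with 256-token chunks yields four decode-maximal batches.
+batches = decode_maximal_batches(1024, 256, ["r0", "r1", "r2"], slots=4)
+assert len(batches) == 4 and batches[0]["decodes"] == ["r0", "r1", "r2"]
+```
+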
+
+
+
+
+ + ♻ ☆ Diffusion Policies for Out-of-Distribution Generalization in Offline + Reinforcement Learning + + +
+ Offline Reinforcement Learning (RL) methods leverage previous experiences to
+learn better policies than the behavior policy used for data collection. In
+contrast to behavior cloning, which assumes the data is collected from expert
+demonstrations, offline RL can work with non-expert data and multimodal
+behavior policies. However, offline RL algorithms face challenges in handling
+distribution shifts and effectively representing policies due to the lack of
+online interaction during training. Prior work on offline RL uses conditional
+diffusion models to represent multimodal behavior in the dataset. Nevertheless,
+these methods are not tailored toward alleviating the out-of-distribution state
+generalization problem. We introduce a novel method, named State Reconstruction
+for Diffusion Policies (SRDP), incorporating state reconstruction feature
+learning in the recent class of diffusion policies to address the
+out-of-distribution generalization problem. State reconstruction loss promotes
+more descriptive representation learning of states to alleviate the
+distribution shift incurred by the out-of-distribution (OOD) states. We design
+a novel 2D Multimodal Contextual Bandit environment to illustrate the OOD
+generalization of SRDP compared to prior algorithms. In addition, we assess the
+performance of our model on D4RL continuous control benchmarks, namely the
+navigation of an 8-DoF ant and forward locomotion of half-cheetah, hopper, and
+walker2d, achieving state-of-the-art results.
+
+
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ StyleGAN as a Utility-Preserving Face De-identification Method + + +
+ Face de-identification methods have been proposed to preserve users' privacy +by obscuring their faces. These methods, however, can degrade the quality of +photos, and they usually do not preserve the utility of faces, i.e., their age, +gender, pose, and facial expression. Recently, GANs, such as StyleGAN, have +been proposed, which generate realistic, high-quality imaginary faces. In this +paper, we investigate the use of StyleGAN in generating de-identified faces +through style mixing. We examined this de-identification method for preserving +utility and privacy by implementing several face detection, verification, and +identification attacks and conducting a user study. The results from our +extensive experiments, human evaluation, and comparison with two +state-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN +performs on par or better than these methods, preserving users' privacy and +images' utility. In particular, the results of the machine learning-based +experiments show that StyleGAN0-4 preserves utility better than CIAGAN and +DeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves +utility at the same level while providing more privacy. In this paper, for the +first time, we also performed a carefully designed user study to examine both +privacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well +as CIAGAN and DeepPrivacy from the human observers' perspectives. Our +statistical tests showed that participants tend to verify and identify +StyleGAN0-5 images more easily than DeepPrivacy images. All the methods but +StyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding +utility, as expected, StyleGAN0-5 performed significantly better in preserving +some attributes. Among all methods, on average, participants believe gender has +been preserved the most while naturalness has been preserved the least. + +
+
+
+
+
+ + ♻ ☆ Seeking Interpretability and Explainability in Binary Activated Neural + Networks + + +
+ We study the use of binary activated neural networks as interpretable and
+explainable predictors in the context of regression tasks on tabular data;
+more specifically, we provide guarantees on their expressiveness and present
+an approach based on the efficient computation of SHAP values for quantifying
+the relative importance of the features, hidden neurons and even weights. As
+the model's simplicity is instrumental in achieving interpretability, we
+propose a greedy algorithm for building compact binary activated networks.
+This approach does not need to fix an architecture for the network in advance:
+it is built one layer at a time, one neuron at a time, leading to predictors
+that are not needlessly complex for a given task.
+
+
+
+
+
+
+ + ♻ ☆ Dynamical systems' based neural networks + + +
+ Neural networks have gained much interest because of their effectiveness in +many applications. However, their mathematical properties are generally not +well understood. If there is some underlying geometric structure inherent to +the data or to the function to approximate, it is often desirable to take this +into account in the design of the neural network. In this work, we start with a +non-autonomous ODE and build neural networks using a suitable, +structure-preserving, numerical time-discretisation. The structure of the +neural network is then inferred from the properties of the ODE vector field. +Besides injecting more structure into the network architectures, this modelling +procedure allows a better theoretical understanding of their behaviour. We +present two universal approximation results and demonstrate how to impose some +particular properties on the neural networks. A particular focus is on +1-Lipschitz architectures including layers that are not 1-Lipschitz. These +networks are expressive and robust against adversarial attacks, as shown for +the CIFAR-10 and CIFAR-100 datasets. + +
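+ The construction can be illustrated with the simplest possible choice, an
+explicit-Euler discretisation of a non-autonomous ODE; the paper focuses on
+structure-preserving integrators, so forward Euler is shown only for
+concreteness:
+```python
+import torch
+import torch.nn as nn
+
+class EulerBlock(nn.Module):
+    # One explicit-Euler step x_{k+1} = x_k + h * f_theta(x_k, t_k) of a
+    # non-autonomous ODE; stacking such blocks gives a ResNet-like network.
+    def __init__(self, dim, h=0.1):
+        super().__init__()
+        self.f = nn.Sequential(nn.Linear(dim + 1, dim), nn.Tanh(),
+                               nn.Linear(dim, dim))
+        self.h = h
+
+    def forward(self, x, t):
+        t_col = torch.full_like(x[:, :1], t)
+        return x + self.h * self.f(torch.cat([x, t_col], dim=1))
+```
+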
+
+
+
+
+ + ♻ ☆ Hypergraph Structure Inference From Data Under Smoothness Prior + + +
+ Hypergraphs are important for processing data with higher-order relationships
+involving more than two entities. In scenarios where explicit hypergraphs are
+not readily available, it is desirable to infer a meaningful hypergraph
+structure from the node features to capture the intrinsic relations within the
+data. However, existing methods either adopt simple pre-defined rules that fail
+to precisely capture the distribution of the potential hypergraph structure, or
+learn a mapping between hypergraph structures and node features but require a
+large amount of labelled data, i.e., pre-existing hypergraph structures, for
+training. Both restrict their applications in practical scenarios. To fill this
+gap, we propose a novel smoothness prior that enables us to design a method to
+infer the probability for each potential hyperedge without labelled data as
+supervision. The proposed prior indicates that the features of nodes within a
+hyperedge are highly correlated with the features of the hyperedge containing
+them. We use this prior to derive the relation between the hypergraph structure
+and the node features via probabilistic modelling. This allows us to develop an
+unsupervised inference method to estimate the probability for each potential
+hyperedge via solving an optimisation problem that has an analytical solution.
+Experiments on both synthetic and real-world data demonstrate that our method
+can learn meaningful hypergraph structures from data more efficiently than
+existing hypergraph structure inference methods.
+
+
+
+
+
+
+ + ♻ ☆ Speeding up Fourier Neural Operators via Mixed Precision + + +
+ The Fourier neural operator (FNO) is a powerful technique for learning +surrogate maps for partial differential equation (PDE) solution operators. For +many real-world applications, which often require high-resolution data points, +training time and memory usage are significant bottlenecks. While there are +mixed-precision training techniques for standard neural networks, those work +for real-valued datatypes on finite dimensions and therefore cannot be directly +applied to FNO, which crucially operates in the (complex-valued) Fourier domain +and in function spaces. On the other hand, since the Fourier transform is +already an approximation (due to discretization error), we do not need to +perform the operation at full precision. In this work, we (i) profile memory +and runtime for FNO with full and mixed-precision training, (ii) conduct a +study on the numerical stability of mixed-precision training of FNO, and (iii) +devise a training routine which substantially decreases training time and +memory usage (up to 34%), with little or no reduction in accuracy, on the +Navier-Stokes and Darcy flow equations. Combined with the recently proposed +tensorized FNO (Kossaifi et al., 2023), the resulting model has far better +performance while also being significantly faster than the original FNO. + +
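+ A hedged sketch of one common mixed-precision pattern: run the network under
+autocast while keeping the complex-valued spectral arithmetic in float32. The
+paper's actual routine chooses which operations to reduce differently; this
+only shows the standard AMP recipe:
+```python
+import torch
+import torch.nn.functional as F
+
+def spectral_conv(x, weights):
+    # Keep the complex FFT arithmetic in float32 even when the surrounding
+    # network runs in half precision under autocast.
+    with torch.autocast(device_type="cuda", enabled=False):
+        X = torch.fft.rfft(x.float(), dim=-1)
+        return torch.fft.irfft(X * weights, n=x.shape[-1], dim=-1)
+
+def train_step(model, x, y, optimizer, scaler):
+    # scaler = torch.cuda.amp.GradScaler(), created once outside the loop.
+    optimizer.zero_grad()
+    with torch.autocast(device_type="cuda", dtype=torch.float16):
+        loss = F.mse_loss(model(x), y)
+    scaler.scale(loss).backward()
+    scaler.step(optimizer)
+    scaler.update()
+    return loss.item()
+```
+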
+
+
+
+
+ + ♻ ☆ Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave + Communications + + +
+ This study demonstrates the feasibility of point cloud-based proactive link
+quality prediction for millimeter-wave (mmWave) communications. Previous
+studies have proposed machine learning-based methods to predict received signal
+strength for future time periods using time series of depth images to mitigate
+the line-of-sight (LOS) path blockage by pedestrians in mmWave communication.
+However, these image-based methods have limited applicability due to privacy
+concerns as camera images may contain sensitive information. This study
+proposes a point cloud-based method for mmWave link quality prediction and
+demonstrates its feasibility through experiments. Point clouds represent
+three-dimensional (3D) spaces as a set of points and are sparser and less
+likely to contain sensitive information than camera images. Additionally, point
+clouds provide 3D position and motion information, which is necessary for
+understanding the radio propagation environment involving pedestrians. This
+study designs the mmWave link quality prediction method and conducts realistic
+indoor experiments, where the link quality fluctuates significantly due to
+human blockage, using commercially available IEEE 802.11ad-based 60 GHz
+wireless LAN devices, together with a Kinect v2 RGB-D camera and a Velodyne
+VLP-16 light detection and ranging (LiDAR) sensor for point cloud acquisition.
+The experimental results showed that our proposed method can predict future
+large attenuation of mmWave received signal strength and throughput induced by
+the LOS path blockage by pedestrians with comparable or superior accuracy to
+image-based prediction methods. Hence, our point cloud-based method can serve
+as a viable alternative to image-based methods.
+
+
+
+ comment: Submitted to IEEE Transactions on Machine Learning in Communications + and Networking +
+
+
+
+
+ + ♻ ☆ Metropolitan Segment Traffic Speeds from Massive Floating Car Data in 10 + Cities + + +
+ Traffic analysis is crucial for urban operations and planning, while the +availability of dense urban traffic data beyond loop detectors is still scarce. +We present a large-scale floating vehicle dataset of per-street segment traffic +information, Metropolitan Segment Traffic Speeds from Massive Floating Car Data +in 10 Cities (MeTS-10), available for 10 global cities with a 15-minute +resolution for collection periods ranging between 108 and 361 days in 2019-2021 +and covering more than 1500 square kilometers per metropolitan area. MeTS-10 +features traffic speed information at all street levels from main arterials to +local streets for Antwerp, Bangkok, Barcelona, Berlin, Chicago, Istanbul, +London, Madrid, Melbourne and Moscow. The dataset leverages the +industrial-scale floating vehicle Traffic4cast data with speeds and vehicle +counts provided in a privacy-preserving spatio-temporal aggregation. We detail +the efficient matching approach mapping the data to the OpenStreetMap road +graph. We evaluate the dataset by comparing it with publicly available +stationary vehicle detector data (for Berlin, London, and Madrid) and the Uber +traffic speed dataset (for Barcelona, Berlin, and London). The comparison +highlights the differences across datasets in spatio-temporal coverage and +variations in the reported traffic caused by the binning method. MeTS-10 +enables novel, city-wide analysis of mobility and traffic patterns for ten +major world cities, overcoming current limitations of spatially sparse vehicle +detector data. The large spatial and temporal coverage offers an opportunity +for joining the MeTS-10 with other datasets, such as traffic surveys in traffic +planning studies or vehicle detector data in traffic control settings. + +
+
+ comment: Accepted by IEEE Transactions on Intelligent Transportation Systems + (T-ITS), DOI: https://doi.org/10.1109/TITS.2023.3291737 +
+
+
+
+
+ + ♻ ☆ Neural Mixed Effects for Nonlinear Personalized Predictions + + +
+ Personalized prediction is a machine learning approach that predicts a +person's future observations based on their past labeled observations and is +typically used for sequential tasks, e.g., to predict daily mood ratings. When +making personalized predictions, a model can combine two types of trends: (a) +trends shared across people, i.e., person-generic trends, such as being happier +on weekends, and (b) unique trends for each person, i.e., person-specific +trends, such as a stressful weekly meeting. Mixed effect models are popular +statistical models to study both trends by combining person-generic and +person-specific parameters. Though linear mixed effect models are gaining +popularity in machine learning by integrating them with neural networks, these +integrations are currently limited to linear person-specific parameters: ruling +out nonlinear person-specific trends. In this paper, we propose Neural Mixed +Effect (NME) models to optimize nonlinear person-specific parameters anywhere +in a neural network in a scalable manner. NME combines the efficiency of neural +network optimization with nonlinear mixed effects modeling. Empirically, we +observe that NME improves performance across six unimodal and multimodal +datasets, including a smartphone dataset to predict daily mood and a +mother-adolescent dataset to predict affective state sequences where half the +mothers experience at least moderate symptoms of depression. Furthermore, we +evaluate NME for two model architectures, including for neural conditional +random fields (CRF) to predict affective state sequences where the CRF learns +nonlinear person-specific temporal transitions between affective states. +Analysis of these person-specific transitions on the mother-adolescent dataset +shows interpretable trends related to the mother's depression symptoms. + +
+
+ comment: camera-ready version +
+
+
+
+
+ + ♻ ☆ Neural ShDF: Reviving an Efficient and Consistent Mesh Segmentation + Method SIGGRAPH 2023 + + +
+ Partitioning a polygonal mesh into meaningful parts can be challenging. Many +applications require decomposing such structures for further processing in +computer graphics. In the last decade, several methods were proposed to tackle +this problem, at the cost of intensive computational times. Recently, machine +learning has proven to be effective for the segmentation task on 3D structures. +Nevertheless, these state-of-the-art methods are often hardly generalizable and +require dividing the learned model into several specific classes of objects to +avoid overfitting. We present a data-driven approach leveraging deep learning +to encode a mapping function prior to mesh segmentation for multiple +applications. Our network reproduces a neighborhood map using our knowledge of +the \textsl{Shape Diameter Function} (SDF) method using similarities among +vertex neighborhoods. Our approach is resolution-agnostic as we downsample the +input meshes and query the full-resolution structure solely for neighborhood +contributions. Using our predicted SDF values, we can inject the resulting +structure into a graph-cut algorithm to generate an efficient and robust mesh +segmentation while considerably reducing the required computation times. + +
+
+ comment: 9 pages, 13 figures, and 3 tables. Short paper and poster published + and presented at SIGGRAPH 2023 +
+
+
+
+
+ + ♻ ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks, such as Reddit discussions. In contrast to traditional comment-only +methods, our approach to labelling a comment as hate speech involves a holistic +analysis of text and images grounded in the discussion context. This is done by +leveraging graph transformers to capture the contextual relationships in the +entire discussion surrounding a comment and grounding the interwoven fusion +layers that combine individual comments' text and image embeddings instead of +processing modalities separately. We compare the performance of our model to +baselines that only process individual comments and conduct extensive ablation +studies. To evaluate our work, we present a new dataset, HatefulDiscussions, +comprising complete multi-modal discussions from multiple online communities on +Reddit. We conclude with future work for multimodal solutions to deliver social +value in online contexts, arguing that capturing a holistic view of a +conversation significantly advances the effort to detect anti-social behaviour. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ♻ ☆ Biclustering Methods via Sparse Penalty + + +
+ In this paper, we first review several biclustering methods that are used to
+identify the most significant clusters in gene expression data. We mainly
+focus on the SSVD (sparse SVD) method and try a new sparse penalty, the
+"Prenet penalty", which had previously been used only in factor analysis to
+gain sparsity. In a simulation study, we try different types of generated
+datasets (with different sparsity and dimension), first with a 1-layer
+approximation and then with k layers, which shows that the mixed Prenet
+penalty is very effective for non-overlapping data. Finally, we use real gene
+expression data to show the behavior of our methods.
+
+
+
comment: This research is still in progress and some issues remain to be fixed
+
+
+
+
+
+ + ♻ ☆ Combining Inductive and Deductive Reasoning for Query Answering over + Incomplete Knowledge Graphs + + +
+ Current methods for embedding-based query answering over incomplete Knowledge +Graphs (KGs) only focus on inductive reasoning, i.e., predicting answers by +learning patterns from the data, and lack the complementary ability to do +deductive reasoning, which requires the application of domain knowledge to +infer further information. To address this shortcoming, we investigate the +problem of incorporating ontologies into embedding-based query answering models +by defining the task of embedding-based ontology-mediated query answering. We +propose various integration strategies into prominent representatives of +embedding models that involve (1) different ontology-driven data augmentation +techniques and (2) adaptation of the loss function to enforce the ontology +axioms. We design novel benchmarks for the considered task based on the LUBM +and the NELL KGs and evaluate our methods on them. The achieved improvements in +the setting that requires both inductive and deductive reasoning are from 20% +to 55% in HITS@3. + +
+
+
+
+
+ + ♻ ☆ Learning Delays in Spiking Neural Networks using Dilated Convolutions + with Learnable Spacings + + +
+ Spiking Neural Networks (SNNs) are a promising research direction for
+building power-efficient information processing systems, especially for
+temporal tasks such as speech recognition. In SNNs, delays refer to the time
+needed for one spike to travel from one neuron to another. These delays matter
+because they influence the spike arrival times, and it is well-known that
+spiking neurons respond more strongly to coincident input spikes. More
+formally, it has been shown theoretically that plastic delays greatly increase
+the expressivity in SNNs. Yet, efficient algorithms to learn these delays have
+been lacking. Here, we propose a new discrete-time algorithm that addresses
+this issue in deep feedforward SNNs using backpropagation, in an offline
+manner. To simulate delays between consecutive layers, we use 1D convolutions
+across time. The kernels contain only a few non-zero weights (one per synapse)
+whose positions correspond to the delays. These positions are learned together
+with the weights using the recently proposed Dilated Convolution with Learnable
+Spacings (DCLS). We evaluated our method on three datasets: the Spiking
+Heidelberg Dataset (SHD), the Spiking Speech Commands (SSC) and its non-spiking
+version Google Speech Commands v0.02 (GSC) benchmarks, which require detecting
+temporal patterns. We used feedforward SNNs with two or three hidden fully
+connected layers, and vanilla leaky integrate-and-fire neurons. We showed that
+fixed random delays help and that learning them helps even more. Furthermore,
+our method outperformed the state-of-the-art in the three datasets without
+using recurrent connections and with substantially fewer parameters. Our work
+demonstrates the potential of delay learning in developing accurate and precise
+models for temporal data processing. Our code is based on PyTorch /
+SpikingJelly and available at: https://github.com/Thvnvtos/SNN-delays
+
+
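+ The "one non-zero weight per synapse, positioned at the delay" idea can be
+sketched with fixed integer delays (DCLS itself learns the positions
+continuously and differentiably, which is omitted here):
+```python
+import torch
+import torch.nn.functional as F
+
+def delayed_projection(spikes, weights, delays, max_delay):
+    # spikes: (batch, n_in, time); weights: (n_out, n_in) floats;
+    # delays: (n_out, n_in) integer delays in time steps.
+    n_out, n_in = weights.shape
+    kernel = torch.zeros(n_out, n_in, max_delay + 1)
+    for o in range(n_out):
+        for i in range(n_in):
+            # A tap at position max_delay - d reads the input delayed by d.
+            kernel[o, i, max_delay - delays[o, i]] = weights[o, i]
+    return F.conv1d(F.pad(spikes, (max_delay, 0)), kernel)
+```
+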
+
+
+
+
+ + ♻ ☆ Transformers Meet Directed Graphs + + +
+ Transformers were originally proposed as a sequence-to-sequence model for +text but have become vital for a wide range of modalities, including images, +audio, video, and undirected graphs. However, transformers for directed graphs +are a surprisingly underexplored topic, despite their applicability to +ubiquitous domains, including source code and logic circuits. In this work, we +propose two direction- and structure-aware positional encodings for directed +graphs: (1) the eigenvectors of the Magnetic Laplacian - a direction-aware +generalization of the combinatorial Laplacian; (2) directional random walk +encodings. Empirically, we show that the extra directionality information is +useful in various downstream tasks, including correctness testing of sorting +networks and source code understanding. Together with a data-flow-centric graph +construction, our model outperforms the prior state of the art on the Open +Graph Benchmark Code2 relatively by 14.7%. + +
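+ The first encoding is concrete enough to sketch in a few lines of numpy; the
+potential q and the number of eigenvectors k below are assumptions:
+```python
+import numpy as np
+
+def magnetic_laplacian_eigvecs(A, q=0.25, k=8):
+    # A: (N, N) directed adjacency matrix (0/1 or weighted).
+    A_s = (A + A.T) / 2.0                      # symmetrised adjacency
+    theta = 2.0 * np.pi * q * (A - A.T)        # antisymmetric phase matrix
+    H = A_s * np.exp(1j * theta)               # Hermitian magnetic adjacency
+    L = np.diag(A_s.sum(axis=1)) - H           # magnetic Laplacian
+    _, vecs = np.linalg.eigh(L)                # eigh handles Hermitian input
+    return vecs[:, :k]                         # k lowest eigenvectors
+```
+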
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ Adaptive Uncertainty-Guided Model Selection for Data-Driven PDE + Discovery + + +
+ We propose a new parameter-adaptive uncertainty-penalized Bayesian
+information criterion (UBIC) to prioritize the parsimonious partial
+differential equation (PDE) that sufficiently governs noisy spatial-temporal
+observed data with few reliable terms. Since the naive use of the BIC for model
+selection has been known to yield an undesirable overfitted PDE, the UBIC
+penalizes the found PDE not only by its complexity but also by the quantified
+uncertainty, derived from the model supports' coefficient of variation in a
+probabilistic view. We also introduce physics-informed neural network learning
+as a simulation-based approach to further validate the selected PDE flexibly
+against the other discovered PDE. Numerical results affirm the successful
+application of the UBIC in identifying the true governing PDE. Additionally, we
+reveal an interesting effect of denoising the observed data on improving the
+trade-off between the BIC score and model complexity. Code is available at
+https://github.com/Pongpisit-Thanasutives/UBIC.
+
+
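+ As a heavily hedged illustration of the idea only: a BIC term plus an
+uncertainty penalty built from the coefficient of variation of the candidate
+PDE's coefficients. The exact penalty form and the weighting lam below are
+assumptions, not the paper's definition:
+```python
+import numpy as np
+
+def ubic_sketch(nll, n_params, n_samples, coef_mean, coef_std, lam=1.0):
+    # BIC = k*ln(n) + 2*NLL; the added term penalises coefficient uncertainty
+    # via the per-term coefficient of variation (assumed form).
+    bic = n_params * np.log(n_samples) + 2.0 * nll
+    cv = np.abs(coef_std / coef_mean)
+    return bic + lam * np.log(n_samples) * cv.sum()
+```
+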
+
+ comment: 17 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ When Deep Learning Meets Polyhedral Theory: A Survey + + +
+ In the past decade, deep learning became the prevalent methodology for
+predictive modeling thanks to the remarkable accuracy of deep neural networks
+in tasks such as computer vision and natural language processing. Meanwhile,
+the structure of neural networks converged back to simpler representations
+based on piecewise constant and piecewise linear functions such as the
+Rectified Linear Unit (ReLU), which became the most commonly used type of
+activation function in neural networks. That made certain types of network
+structure, such as the typical fully-connected feedforward neural network,
+amenable to analysis through polyhedral theory and to the application of
+methodologies such as Linear Programming (LP) and Mixed-Integer Linear
+Programming (MILP) for a variety of purposes. In this paper, we survey the main
+topics emerging from this fast-paced area of work, which bring a fresh
+perspective to understanding neural networks in more detail as well as to
+applying linear optimization techniques to train, verify, and reduce the size
+of such networks.
+
+
+
+
+
+
+ + ♻ ☆ Transformer-based interpretable multi-modal data fusion for skin lesion + classification + + +
+ A lot of deep learning (DL) research these days is mainly focused on +improving quantitative metrics regardless of other factors. In human-centered +applications, like skin lesion classification in dermatology, DL-driven +clinical decision support systems are still in their infancy due to the limited +transparency of their decision-making process. Moreover, the lack of procedures +that can explain the behavior of trained DL algorithms leads to almost no trust +from clinical physicians. To diagnose skin lesions, dermatologists rely on +visual assessment of the disease and the data gathered from the patient's +anamnesis. Data-driven algorithms dealing with multi-modal data are limited by +the separation of feature-level and decision-level fusion procedures required +by convolutional architectures. To address this issue, we enable single-stage +multi-modal data fusion via the attention mechanism of transformer-based +architectures to aid in diagnosing skin diseases. Our method beats other +state-of-the-art single- and multi-modal DL architectures in image-rich and +patient-data-rich environments. Additionally, the choice of the architecture +enables native interpretability support for the classification task both in the +image and metadata domain with no additional modifications necessary. + +
+
+ comment: Submitted to IEEE JBHI in July 2023 +
+
+
+
+
+ + ♻ ☆ Invertible normalizing flow neural networks by JKO scheme + + +
+ Normalizing flow is a class of deep generative models for efficient sampling
+and density estimation. In practice, the flow often appears as a chain of
+invertible neural network blocks; to facilitate training, existing works have
+regularized flow trajectories and designed special network architectures. The
+current paper develops a neural ODE flow network inspired by the
+Jordan-Kinderlehrer-Otto (JKO) scheme, which allows efficient block-wise
+training of the residual blocks without sampling SDE trajectories or inner
+loops of score matching or variational learning. As the JKO scheme unfolds the
+dynamics of the gradient flow, the proposed model naturally stacks residual
+network blocks one by one, reducing the memory load and difficulty in
+performing end-to-end deep flow network training. We also develop adaptive time
+reparameterization of the flow network with a progressive refinement of the
+trajectory in probability space, which improves the model training efficiency
+and accuracy in practice. Using numerical experiments with synthetic and real
+data, we show that the proposed JKO-iFlow model achieves similar or better
+performance in generating new samples compared with the existing flow and
+diffusion models at a significantly reduced computational and memory cost.
+
+
+
+
+
+
+ + ♻ ☆ Data-driven Predictive Latency for 5G: A Theoretical and Experimental + Analysis Using Network Measurements + + +
+ The advent of novel 5G services and applications with binding latency +requirements and guaranteed Quality of Service (QoS) hastened the need to +incorporate autonomous and proactive decision-making in network management +procedures. The objective of our study is to provide a thorough analysis of +predictive latency within 5G networks by utilizing real-world network data that +is accessible to mobile network operators (MNOs). In particular, (i) we present +an analytical formulation of the user-plane latency as a Hypoexponential +distribution, which is validated by means of a comparative analysis with +empirical measurements, and (ii) we conduct experimental results of +probabilistic regression, anomaly detection, and predictive forecasting +leveraging on emerging domains in Machine Learning (ML), such as Bayesian +Learning (BL) and Machine Learning on Graphs (GML). We test our predictive +framework using data gathered from scenarios of vehicular mobility, dense-urban +traffic, and social gathering events. Our results provide valuable insights +into the efficacy of predictive algorithms in practical applications. + +
+
+
+
+
+ + ♻ ☆ Generative Sliced MMD Flows with Riesz Kernels + + +
+ Maximum mean discrepancy (MMD) flows suffer from high computational costs in +large scale computations. In this paper, we show that MMD flows with Riesz +kernels $K(x,y) = - \Vert x-y\Vert^r$, $r \in (0,2)$ have exceptional +properties which allow their efficient computation. We prove that the MMD of +Riesz kernels coincides with the MMD of their sliced version. As a consequence, +the computation of gradients of MMDs can be performed in the one-dimensional +setting. Here, for $r=1$, a simple sorting algorithm can be applied to reduce +the complexity from $O(MN+N^2)$ to $O((M+N)\log(M+N))$ for two measures with +$M$ and $N$ support points. As another interesting follow-up result, the MMD of +compactly supported measures can be estimated from above and below by the +Wasserstein-1 distance. For the implementations we approximate the gradient of +the sliced MMD by using only a finite number $P$ of slices. We show that the +resulting error has complexity $O(\sqrt{d/P})$, where $d$ is the data +dimension. These results enable us to train generative models by approximating +MMD gradient flows by neural networks even for image applications. We +demonstrate the efficiency of our model by image generation on MNIST, +FashionMNIST and CIFAR10. + +
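The r = 1 case above admits a compact illustration. For 1-D samples, the pairwise-distance sums in the MMD with kernel K(x, y) = -|x - y| can be computed after sorting via the identity sum_{i<j}(z_(j) - z_(i)) = sum_k (2k - n + 1) z_(k), and the cross term follows from the pooled-sample sum; averaging over random projections gives a Monte-Carlo sliced MMD, which matches the full MMD up to the dimension-dependent constant the paper establishes. A minimal NumPy sketch, not the authors' implementation:

```python
import numpy as np

def sum_abs_diffs(z):
    """Sum of |z_i - z_j| over all pairs i<j in O(n log n):
    for sorted z, this equals sum_k (2k - n + 1) * z_(k), k = 0..n-1."""
    z = np.sort(z)
    n = len(z)
    return np.dot(2 * np.arange(n) - n + 1, z)

def mmd2_riesz_1d(x, y):
    """Squared MMD with kernel K(a, b) = -|a - b| (r = 1) for 1-D samples.
    Cross term recovered from the pooled sum: pooled = xx + yy + cross."""
    m, n = len(x), len(y)
    sxx, syy = sum_abs_diffs(x), sum_abs_diffs(y)
    sxy = sum_abs_diffs(np.concatenate([x, y])) - sxx - syy
    return 2 * sxy / (m * n) - 2 * sxx / m**2 - 2 * syy / n**2

def sliced_mmd2(X, Y, P=100, rng=np.random.default_rng(0)):
    """Monte-Carlo sliced MMD^2: average the 1-D MMD^2 over P random slices."""
    d = X.shape[1]
    total = 0.0
    for _ in range(P):
        w = rng.standard_normal(d)
        w /= np.linalg.norm(w)
        total += mmd2_riesz_1d(X @ w, Y @ w)
    return total / P
```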
+
+
+
+
+ + ♻ ☆ Leveraging Image-based Generative Adversarial Networks for Time Series + Generation + + +
+ Generative models for images have gained significant attention in computer
+vision and natural language processing due to their ability to generate
+realistic samples from complex data distributions. To leverage the advances of
+image-based generative models for the time series domain, we propose a
+two-dimensional image representation for time series, the Extended
+Intertemporal Return Plot (XIRP). Our approach captures the intertemporal time
+series dynamics in a scale-invariant and invertible way, reducing training time
+and improving sample quality. We benchmark synthetic XIRPs obtained by an
+off-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) against other
+image representations and models on similarity and predictive ability metrics.
+Our novel, validated image representation for time series consistently and
+significantly outperforms a state-of-the-art RNN-based generative model in
+predictive ability. Further, we introduce an improved stochastic inversion to
+substantially improve simulation quality regardless of the representation, and
+highlight the potential for transfer to other domains.
+ 
</p>
+
+
+
+
+ + ♻ ☆ 0/1 Deep Neural Networks via Block Coordinate Descent + + +
+ The step function is one of the simplest and most natural activation
+functions for deep neural networks (DNNs). As it counts 1 for positive
+variables and 0 for others, its intrinsic characteristics (e.g., discontinuity
+and the absence of viable subgradient information) have impeded its development
+for several decades. Even though there is an impressive body of work on
+designing DNNs with continuous activation functions that can be deemed
+surrogates of the step function, the step function still possesses some
+advantageous properties, such as complete robustness to outliers and the
+capability of attaining the best learning-theoretic guarantee of predictive
+accuracy. Hence, in this paper, we aim to train DNNs with the step function
+used as the activation function (dubbed 0/1 DNNs). We first reformulate 0/1
+DNNs as an unconstrained optimization problem and then solve it by a block
+coordinate descent (BCD) method. Moreover, we acquire closed-form solutions for
+the sub-problems of BCD as well as its convergence properties. Furthermore, we
+also integrate $\ell_{2,0}$-regularization into 0/1 DNNs to accelerate the
+training process and compress the network scale. As a result, the proposed
+algorithm has a desirable performance on classifying the MNIST, FashionMNIST,
+Cifar10, and Cifar100 datasets.
+ 
</p>
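For reference, the step activation in question appears below in a minimal NumPy forward pass. Its gradient is zero almost everywhere (np.heaviside has no useful subgradient), which is why the paper resorts to block coordinate descent with closed-form sub-problem solutions instead of backpropagation; the network shapes here are arbitrary.

```python
import numpy as np

def step(z):
    # 0/1 activation: 1 for positive inputs, 0 otherwise (value 0 at z == 0)
    return np.heaviside(z, 0.0)

def forward(X, weights, biases):
    """Forward pass of a 0/1 DNN: every hidden unit emits a hard 0 or 1, so
    the network is piecewise constant in its inputs -- robust to outlier
    perturbations, but with no informative gradients for backprop."""
    a = X
    for W, b in zip(weights, biases):
        a = step(a @ W + b)
    return a

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 3))
Ws = [rng.standard_normal((3, 4)), rng.standard_normal((4, 2))]
bs = [rng.standard_normal(4), rng.standard_normal(2)]
print(forward(X, Ws, bs))   # rows of hard 0/1 codes
```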
+
+
+
+
+ + ♻ ☆ Principled Pruning of Bayesian Neural Networks through Variational Free + Energy Minimization + + +
+ Bayesian model reduction provides an efficient approach for comparing the +performance of all nested sub-models of a model, without re-evaluating any of +these sub-models. Until now, Bayesian model reduction has been applied mainly +in the computational neuroscience community on simple models. In this paper, we +formulate and apply Bayesian model reduction to perform principled pruning of +Bayesian neural networks, based on variational free energy minimization. Direct +application of Bayesian model reduction, however, gives rise to approximation +errors. Therefore, a novel iterative pruning algorithm is presented to +alleviate the problems arising with naive Bayesian model reduction, as +supported experimentally on the publicly available UCI datasets for different +inference algorithms. This novel parameter pruning scheme solves the +shortcomings of current state-of-the-art pruning methods that are used by the +signal processing community. The proposed approach has a clear stopping +criterion and minimizes the same objective that is used during training. Next +to these benefits, our experiments indicate better model performance in +comparison to state-of-the-art pruning schemes. + +
+
+
+
+
+ + ♻ ☆ The Role of Diverse Replay for Generalisation in Reinforcement Learning + + +
+ In reinforcement learning (RL), key components of many algorithms are the
+exploration strategy and replay buffer. These strategies regulate what
+environment data is collected and trained on and have been extensively studied
+in the RL literature. In this paper, we investigate the impact of these
+components in the context of generalisation in multi-task RL. We investigate
+the hypothesis that collecting and training on more diverse data from the
+training environments will improve zero-shot generalisation to new tasks. We
+motivate mathematically and show empirically that generalisation to tasks that
+are "reachable" during training is improved by increasing the diversity of
+transitions in the replay buffer. Furthermore, we show empirically that this
+same strategy also shows improvement for generalisation to similar but
+"unreachable" tasks, which could be due to improved generalisation of the
+learned latent representations.
+ 
</p>
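One simple way to make the idea concrete is a replay buffer that, once full, overwrites the stored transition most similar to the incoming one, keeping the buffer spread out over state space. This is an illustrative diversity heuristic, not the exact mechanism studied in the paper:

```python
import numpy as np

class DiversityReplayBuffer:
    """Replay buffer biased toward diverse transitions: when full, a new
    transition replaces the stored one whose state is nearest to it, so
    near-duplicates get squeezed out (one heuristic among many)."""

    def __init__(self, capacity, state_dim):
        self.capacity = capacity
        self.states = np.zeros((capacity, state_dim))
        self.transitions = [None] * capacity
        self.size = 0

    def add(self, state, transition):
        if self.size < self.capacity:
            self.states[self.size] = state
            self.transitions[self.size] = transition
            self.size += 1
        else:
            d = np.linalg.norm(self.states - state, axis=1)
            i = int(np.argmin(d))          # most redundant slot
            self.states[i] = state         # replace it with the newcomer
            self.transitions[i] = transition

    def sample(self, batch, rng=np.random.default_rng()):
        idx = rng.integers(0, self.size, size=batch)
        return [self.transitions[i] for i in idx]
```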
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Yet Another ICU Benchmark: A Flexible Multi-Center Framework for + Clinical ML + + +
+ Medical applications of machine learning (ML) have experienced a surge in +popularity in recent years. The intensive care unit (ICU) is a natural habitat +for ML given the abundance of available data from electronic health records. +Models have been proposed to address numerous ICU prediction tasks like the +early detection of complications. While authors frequently report +state-of-the-art performance, it is challenging to verify claims of +superiority. Datasets and code are not always published, and cohort +definitions, preprocessing pipelines, and training setups are difficult to +reproduce. This work introduces Yet Another ICU Benchmark (YAIB), a modular +framework that allows researchers to define reproducible and comparable +clinical ML experiments; we offer an end-to-end solution from cohort definition +to model evaluation. The framework natively supports most open-access ICU +datasets (MIMIC III/IV, eICU, HiRID, AUMCdb) and is easily adaptable to future +ICU datasets. Combined with a transparent preprocessing pipeline and extensible +training code for multiple ML and deep learning models, YAIB enables unified +model development. Our benchmark comes with five predefined established +prediction tasks (mortality, acute kidney injury, sepsis, kidney function, and +length of stay) developed in collaboration with clinicians. Adding further +tasks is straightforward by design. Using YAIB, we demonstrate that the choice +of dataset, cohort definition, and preprocessing have a major impact on the +prediction performance - often more so than model class - indicating an urgent +need for YAIB as a holistic benchmarking tool. We provide our work to the +clinical ML community to accelerate method development and enable real-world +clinical implementations. Software Repository: +https://github.com/rvandewater/YAIB. + +
+
+ comment: Main benchmark: https://github.com/rvandewater/YAIB, Cohort + generation: https://github.com/rvandewater/YAIB-cohorts, Models: + https://github.com/rvandewater/YAIB-models +
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision
+transformer (CNN-Transformer) for medical image segmentation. The proposed
+Hybrid Decoder, based on the MaxViT block, is designed to harness the power of
+both the convolution and self-attention mechanisms at each decoding stage with
+a nominal memory and computational burden. The inclusion of multi-axis
+self-attention within each decoder stage significantly enhances the
+discriminating capacity between the object and background regions, thereby
+helping to improve segmentation performance. In the Hybrid Decoder block, the
+fusion process commences by integrating the upsampled lower-level decoder
+features, obtained through transpose convolution, with the skip-connection
+features derived from the hybrid encoder. Subsequently, the fused features
+undergo refinement through a multi-axis attention mechanism. The proposed
+decoder block is repeated multiple times to progressively segment the nuclei
+regions. Experimental results on the MoNuSeg18 and MoNuSAC20 datasets
+demonstrate the effectiveness of the proposed technique. Our MaxViT-UNet
+outperformed the previous CNN-based (UNet) and Transformer-based (Swin-UNet)
+techniques by a considerable margin on both of the standard datasets. The
+following GitHub repository (https://github.com/PRLAB21/MaxViT-UNet) contains
+the implementation and trained weights.
+ 
</p>
+
+ comment: 17 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Collage Diffusion + + +
+ We seek to give users precise control over diffusion-based image generation +by modeling complex scenes as sequences of layers, which define the desired +spatial arrangement and visual attributes of objects in the scene. Collage +Diffusion harmonizes the input layers to make objects fit together -- the key +challenge involves minimizing changes in the positions and key visual +attributes of the input layers while allowing other attributes to change in the +harmonization process. We ensure that objects are generated in the correct +locations by modifying text-image cross-attention with the layers' alpha masks. +We preserve key visual attributes of input layers by learning specialized text +representations per layer and by extending ControlNet to operate on layers. +Layer input allows users to control the extent of image harmonization on a +per-object basis, and users can even iteratively edit individual objects in +generated images while keeping other objects fixed. By leveraging the rich +information present in layer input, Collage Diffusion generates globally +harmonized images that maintain desired object characteristics better than +prior approaches. + +
+
+
+
+
+ + ♻ ☆ Knowledge Enhanced Graph Neural Networks for Graph Completion + + +
+ Graph data is omnipresent and has a wide variety of applications, such as in
+natural science, social networks, or the semantic web. However, while being
+rich in information, graphs are often noisy and incomplete. As a result, graph
+completion tasks, such as node classification or link prediction, have gained
+attention. On one hand, neural methods, such as graph neural networks, have
+proven to be robust tools for learning rich representations of noisy graphs. On
+the other hand, symbolic methods enable exact reasoning on graphs. We propose
+Knowledge Enhanced Graph Neural Networks (KeGNN), a neuro-symbolic framework
+for graph completion that combines both paradigms, as it allows for the
+integration of prior knowledge into a graph neural network model. Essentially,
+KeGNN consists of a graph neural network as a base upon which knowledge
+enhancement layers are stacked with the goal of refining predictions with
+respect to prior knowledge. We instantiate KeGNN in conjunction with two
+state-of-the-art graph neural networks, Graph Convolutional Networks and Graph
+Attention Networks, and evaluate KeGNN on multiple benchmark datasets for node
+classification.
+ 
</p>
+
+
+
+
+ + ♻ ☆ StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent + Disentangled Space + + +
+ One major challenge in machine learning applications is coping with +mismatches between the datasets used in the development and those obtained in +real-world applications. These mismatches may lead to inaccurate predictions +and errors, resulting in poor product quality and unreliable systems. In this +study, we propose StyleDiff to inform developers of the differences between the +two datasets for the steady development of machine learning systems. Using +disentangled image spaces obtained from recently proposed generative models, +StyleDiff compares the two datasets by focusing on attributes in the images and +provides an easy-to-understand analysis of the differences between the +datasets. The proposed StyleDiff performs in $O (d N\log N)$, where $N$ is the +size of the datasets and $d$ is the number of attributes, enabling the +application to large datasets. We demonstrate that StyleDiff accurately detects +differences between datasets and presents them in an understandable format +using, for example, driving scenes datasets. + +
+
+ comment: 25 pages, 17 figures, Image and Vision Computing +
+
+
+
+
+ + ♻ ☆ Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning ICCV 2023 + + +
+ Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful
+alternative to full fine-tuning for adapting pre-trained vision models to
+downstream tasks: it tunes only a small number of parameters while freezing
+the vast majority, easing the storage burden and optimization difficulty.
+However, existing PEFT methods introduce trainable parameters at the same
+positions across different tasks, relying solely on human heuristics and
+neglecting the domain gaps. To this end, we study where to introduce and how to
+allocate trainable parameters by proposing a novel Sensitivity-aware visual
+Parameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates
+trainable parameters to task-specific important positions given a desired
+tunable parameter budget. Specifically, our SPT first quickly identifies the
+sensitive parameters that require tuning for a given task in a data-dependent
+way. Next, our SPT further boosts the representational capability of the
+weight matrices whose number of sensitive parameters exceeds a pre-defined
+threshold by utilizing existing structured tuning methods, e.g., LoRA [23] or
+Adapter [22], to replace directly tuning the selected sensitive parameters
+(unstructured tuning) under the budget. Extensive experiments on a wide range
+of downstream recognition tasks show that our SPT is complementary to the
+existing PEFT methods and largely boosts their performance, e.g., SPT improves
+Adapter with a supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean
+Top-1 accuracy, reaching SOTA performance on the FGVC and VTAB-1k benchmarks,
+respectively. Source code is at https://github.com/ziplab/SPT
+ 
</p>
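A sketch of the data-dependent sensitivity step: a common first-order proxy scores each parameter by the accumulated magnitude of gradient times weight and keeps the top entries within the budget. The exact criterion used by SPT may differ; this stand-in only illustrates the "identify sensitive positions from data" idea:

```python
import torch

def top_sensitive_params(model, loss_fn, data_loader, budget):
    """Rank parameters by a first-order sensitivity proxy |grad * weight|,
    accumulated over a few batches, and return a 0/1 mask per tensor that
    selects the `budget` most sensitive entries overall. (Illustrative
    stand-in; the SPT paper's criterion may differ.)"""
    scores = {n: torch.zeros_like(p) for n, p in model.named_parameters()}
    for x, y in data_loader:
        model.zero_grad()
        loss_fn(model(x), y).backward()
        for n, p in model.named_parameters():
            if p.grad is not None:
                scores[n] += (p.grad * p).abs()
    flat = torch.cat([s.flatten() for s in scores.values()])
    threshold = torch.topk(flat, budget).values.min()
    return {n: (s >= threshold).float() for n, s in scores.items()}
```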
+
+ comment: ICCV 2023 Oral +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Embeddings in the Biomedical Domain: Are They Useful? A + Look at Link Prediction, Rule Learning, and Downstream Polypharmacy Tasks + + +
+ Knowledge graphs are powerful tools for representing and organising complex +biomedical data. Several knowledge graph embedding algorithms have been +proposed to learn from and complete knowledge graphs. However, a recent study +demonstrates the limited efficacy of these embedding algorithms when applied to +biomedical knowledge graphs, raising the question of whether knowledge graph +embeddings have limitations in biomedical settings. This study aims to apply +state-of-the-art knowledge graph embedding models in the context of a recent +biomedical knowledge graph, BioKG, and evaluate their performance and potential +downstream uses. We achieve a three-fold improvement in terms of performance +based on the HITS@10 score over previous work on the same biomedical knowledge +graph. Additionally, we provide interpretable predictions through a rule-based +method. We demonstrate that knowledge graph embedding models are applicable in +practice by evaluating the best-performing model on four tasks that represent +real-life polypharmacy situations. Results suggest that knowledge learnt from +large biomedical knowledge graphs can be transferred to such downstream use +cases. Our code is available at https://github.com/aryopg/biokge. + +
+
+
+
+
+ + ♻ ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,
+organs, vessels) and 3D surgical instrument models. The broad application of
+statistical shape models (SSMs) in medical image analysis prior to the deep
+learning era is evidence that shapes have been commonly used to describe
+medical data. Nowadays, however, state-of-the-art (SOTA) deep learning
+algorithms in medical imaging are predominantly voxel-based. In computer
+vision, on the contrary, shapes (including voxel occupancy grids, meshes, point
+clouds and implicit surface models) are the preferred data representations in
+3D, as seen from the numerous shape-related publications in premier vision
+conferences, such as the IEEE/CVF Conference on Computer Vision and Pattern
+Recognition (CVPR), as well as the increasing popularity of ShapeNet (about
+51,300 models) and Princeton ModelNet (127,915 models) in computer vision
+research. MedShapeNet is created as an alternative to these commonly used shape
+benchmarks to facilitate the translation of data-driven vision algorithms to
+medical applications, and it extends the opportunities to adapt SOTA vision
+algorithms to solve critical medical problems. Besides, the majority of the
+medical shapes in MedShapeNet are modeled directly on the imaging data of real
+patients; it therefore complements existing shape benchmarks comprising
+computer-aided design (CAD) models. MedShapeNet currently includes more than
+100,000 medical shapes, and provides annotations in the form of paired data. It
+is therefore also a freely available repository of 3D models for extended
+reality (virtual reality - VR, augmented reality - AR, mixed reality - MR) and
+medical 3D printing. This white paper describes in detail the motivations
+behind MedShapeNet, the shape acquisition procedures, the use cases, as well as
+the usage of the online shape search portal: https://medshapenet.ikim.nrw/
+ 
</p>
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation + + +
+ Medical image data are often limited due to the expensive acquisition and
+annotation process. Hence, training a deep-learning model with only raw data
+can easily lead to overfitting. One solution to this problem is to augment the
+raw data with various transformations, improving the model's ability to
+generalize to new data. However, manually configuring a generic augmentation
+combination and parameters for different datasets is non-trivial due to
+inconsistent acquisition approaches and data distributions. Therefore,
+automatic data augmentation has been proposed to learn favorable augmentation
+strategies for different datasets, albeit at the cost of large GPU overhead. To
+this end, we present a novel method, called Dynamic Data Augmentation (DDAug),
+which is efficient and has negligible computation cost. Our DDAug develops a
+hierarchical tree structure to represent various augmentations and utilizes an
+efficient Monte-Carlo tree searching algorithm to update, prune, and sample the
+tree. As a result, the augmentation pipeline can be optimized for each dataset
+automatically. Experiments on multiple Prostate MRI datasets show that our
+method outperforms the current state-of-the-art data augmentation strategies.
+ 
</p>
+
+
+
+
+ + ♻ ☆ Improving the Validity of Decision Trees as Explanations + + +
+ In classification and forecasting with tabular data, one often utilizes +tree-based models. Those can be competitive with deep neural networks on +tabular data [cf. Grinsztajn et al., NeurIPS 2022, arXiv:2207.08815] and, under +some conditions, explainable. The explainability depends on the depth of the +tree and the accuracy in each leaf of the tree. Decision trees containing +leaves with unbalanced accuracy can provide misleading explanations. +Low-accuracy leaves give less valid explanations, which could be interpreted as +unfairness among explanations. Here, we train a shallow tree with the objective +of minimizing the maximum misclassification error across each leaf node. Then, +we extend each leaf with a separate tree-based model. The shallow tree provides +a global explanation, while the overall statistical performance of the shallow +tree with extended leaves improves upon decision trees of unlimited depth +trained using classical methods (e.g., CART) and is comparable to +state-of-the-art methods (e.g., well-tuned XGBoost). + +
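The two-stage construction is easy to prototype with scikit-learn: fit a shallow global tree, assign training samples to leaves with tree.apply, and grow a separate model inside each leaf. Note that the paper trains the shallow tree to minimize the maximum per-leaf misclassification error, a custom objective; the plain CART tree below is an illustrative stand-in for it:

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

class LeafExtendedTree:
    """Shallow global tree (the explanation) with a separate tree-based model
    grown inside each leaf (the performance). A plain CART tree stands in for
    the paper's min-max per-leaf-error objective."""

    def __init__(self, depth=3):
        self.root = DecisionTreeClassifier(max_depth=depth, random_state=0)
        self.leaf_models = {}

    def fit(self, X, y):
        self.root.fit(X, y)
        leaves = self.root.apply(X)              # leaf id for each sample
        for leaf in np.unique(leaves):
            m = leaves == leaf
            sub = DecisionTreeClassifier(random_state=0)  # or XGBoost, etc.
            sub.fit(X[m], y[m])
            self.leaf_models[leaf] = sub
        return self

    def predict(self, X):
        leaves = self.root.apply(X)
        out = np.empty(len(X), dtype=self.root.classes_.dtype)
        for leaf in np.unique(leaves):
            m = leaves == leaf
            out[m] = self.leaf_models[leaf].predict(X[m])
        return out
```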
+
+
+
+
+ + ♻ ☆ Quantization-based Optimization with Perspective of Quantum Mechanics + + +
+ Statistical and stochastic analysis based on thermodynamics has been the main
+analysis framework for stochastic global optimization. With the recent
+appearance of quantum annealing and quantum tunneling algorithms for global
+optimization, a new research framework for global optimization algorithms is
+required. In this paper, we provide an analysis of quantization-based
+optimization based on the Schr\"odinger equation to reveal which property of
+quantum mechanics enables global optimization. We show that the tunneling
+effect derived from the Schr\"odinger equation in quantization-based
+optimization enables escape from a local minimum. Additionally, we confirm that
+this tunneling effect is the same property found in quantum mechanics-based
+global optimization. Experiments with standard multi-modal benchmark functions
+show that the proposed analysis is valid.
+ 
</p>
+
+ comment: Preprint for ICTC conference (First Revision) +
+
+
+
+
+ + ♻ ☆ Online Distributed Learning with Quantized Finite-Time Coordination + + +
+ In this paper we consider online distributed learning problems. Online +distributed learning refers to the process of training learning models on +distributed data sources. In our setting a set of agents need to cooperatively +train a learning model from streaming data. Differently from federated +learning, the proposed approach does not rely on a central server but only on +peer-to-peer communications among the agents. This approach is often used in +scenarios where data cannot be moved to a centralized location due to privacy, +security, or cost reasons. In order to overcome the absence of a central +server, we propose a distributed algorithm that relies on a quantized, +finite-time coordination protocol to aggregate the locally trained models. +Furthermore, our algorithm allows for the use of stochastic gradients during +local training. Stochastic gradients are computed using a randomly sampled +subset of the local training data, which makes the proposed algorithm more +efficient and scalable than traditional gradient descent. In our paper, we +analyze the performance of the proposed algorithm in terms of the mean distance +from the online solution. Finally, we present numerical results for a logistic +regression task. + +
+
+ comment: To be presented at IEEE CDC'23 +
+
+
+
+
+ + ♻ ☆ Sequential Informed Federated Unlearning: Efficient and Provable Client + Unlearning in Federated Optimization + + +
+ The aim of Machine Unlearning (MU) is to provide theoretical guarantees on
+the removal of the contribution of a given data point from a training
+procedure. Federated Unlearning (FU) consists in extending MU to unlearn a
+given client's contribution from a federated training routine. Current FU
+approaches are generally not scalable, and do not come with sound theoretical
+quantification of the effectiveness of unlearning. In this work we present
+Informed Federated Unlearning (IFU), a novel efficient and quantifiable FU
+approach. Upon unlearning request from a given client, IFU identifies the
+optimal FL iteration from which FL has to be reinitialized, with unlearning
+guarantees obtained through a randomized perturbation mechanism. The theory of
+IFU is also extended to account for sequential unlearning requests.
+Experimental results on different tasks and datasets show that IFU leads to
+more efficient unlearning procedures as compared to basic re-training and
+state-of-the-art FU approaches.
+ 
</p>
+
+
+
+
+ + ♻ ☆ Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning + + +
+ Federated learning (FL) enables multiple clients to collaboratively train a
+global model without disclosing their data. Previous research often requires
+training the complete set of model parameters. However, the emergence of
+powerful pre-trained models makes it possible to achieve higher performance
+with fewer learnable parameters in FL. In this paper, we propose a federated
+adaptive prompt tuning algorithm, FedAPT, for multi-domain collaborative image
+classification with powerful foundation models, like CLIP. Compared with direct
+federated prompt tuning, our core idea is to adaptively unlock specific domain
+knowledge for each test sample in order to provide them with personalized
+prompts. To implement this idea, we design an adaptive prompt tuning module,
+which consists of a meta prompt, an adaptive network, and some keys. The server
+randomly generates a set of keys and assigns a unique key to each client. Then
+all clients cooperatively train the global adaptive network and meta prompt
+with the local datasets and the frozen keys. Ultimately, the global aggregation
+model can assign a personalized prompt to CLIP based on the domain features of
+each test sample. We perform extensive experiments on two multi-domain image
+classification datasets across two different settings - supervised and
+unsupervised. The results show that FedAPT can achieve better performance with
+less than 10\% of the number of parameters of the fully trained model, and the
+global model can perform well in diverse client domains simultaneously.
+ 
</p>
+
+
+
+
+ + ♻ ☆ RBA-GCN: Relational Bilevel Aggregation Graph Convolutional Network for + Emotion Recognition + + +
+ Emotion recognition in conversation (ERC) has received increasing attention
+from researchers due to its wide range of applications. As conversation has a
+natural graph structure, numerous approaches used to model ERC based on graph
+convolutional networks (GCNs) have yielded significant results. However, the
+aggregation approach of traditional GCNs suffers from the node information
+redundancy problem, leading to node discriminant information loss.
+Additionally, single-layer GCNs lack the capacity to capture long-range
+contextual information from the graph. Furthermore, the majority of approaches
+are based on textual modality or stitching together different modalities,
+resulting in a weak ability to capture interactions between modalities. To
+address these problems, we present the relational bilevel aggregation graph
+convolutional network (RBA-GCN), which consists of three modules: the graph
+generation module (GGM), the similarity-based cluster building module (SCBM),
+and the bilevel aggregation module (BiAM). First, GGM constructs a novel graph
+to reduce the redundancy of target node information. Then, SCBM calculates the
+node similarity in the target node and its structural neighborhood, where noisy
+information with low similarity is filtered out to preserve the discriminant
+information of the node. Meanwhile, BiAM is a novel aggregation method that can
+preserve the information of nodes during the aggregation process. This module
+can construct the interaction between different modalities and capture
+long-range contextual information based on similarity clusters. On both the
+IEMOCAP and MELD datasets, the weighted average F1 score of RBA-GCN has a
+2.17$\sim$5.21\% improvement over that of the most advanced method. Our code is
+available at https://github.com/luftmenscher/RBA-GCN, and our article of the
+same name has been published in IEEE/ACM Transactions on Audio, Speech, and
+Language Processing, vol. 31, 2023.
+ 
</p>
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding and Improving Adversarial + Transferability from Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs +that successfully fool white-box surrogate models can also deceive other +black-box models with different architectures. Although a bunch of empirical +studies have provided guidance on generating highly transferable AEs, many of +these findings lack explanations and even lead to inconsistent advice. In this +paper, we take a further step towards understanding adversarial +transferability, with a particular focus on surrogate aspects. Starting from +the intriguing little robustness phenomenon, where models adversarially trained +with mildly perturbed adversarial samples can serve as better surrogates, we +attribute it to a trade-off between two predominant factors: model smoothness +and gradient similarity. Our investigations focus on their joint effects, +rather than their separate correlations with transferability. Through a series +of theoretical and empirical analyses, we conjecture that the data distribution +shift in adversarial training explains the degradation of gradient similarity. +Building on these insights, we explore the impacts of data augmentation and +gradient regularization on transferability and identify that the trade-off +generally exists in the various training mechanisms, thus building a +comprehensive blueprint for the regulation mechanism behind transferability. +Finally, we provide a general route for constructing better surrogates to boost +transferability which optimizes both model smoothness and gradient similarity +simultaneously, e.g., the combination of input gradient regularization and +sharpness-aware minimization (SAM), validated by extensive experiments. In +summary, we call for attention to the united impacts of these two factors for +launching effective transfer attacks, rather than optimizing one while ignoring +the other, and emphasize the crucial role of manipulating surrogate models. + +
+
+ comment: IEEE Symposium on Security and Privacy (Oakland) 2024; Extended + version of camera-ready +
+
+
+
+
+ + ♻ ☆ Simulation-Based Optimization of User Interfaces for Quality-Assuring + Machine Learning Model Predictions + + +
+ Quality-sensitive applications of machine learning (ML) require quality +assurance (QA) by humans before the predictions of an ML model can be deployed. +QA for ML (QA4ML) interfaces require users to view a large amount of data and +perform many interactions to correct errors made by the ML model. An optimized +user interface (UI) can significantly reduce interaction costs. While UI +optimization can be informed by user studies evaluating design options, this +approach is not scalable because there are typically numerous small variations +that can affect the efficiency of a QA4ML interface. Hence, we propose using +simulation to evaluate and aid the optimization of QA4ML interfaces. In +particular, we focus on simulating the combined effects of human intelligence +in initiating appropriate interaction commands and machine intelligence in +providing algorithmic assistance for accelerating QA4ML processes. As QA4ML is +usually labor-intensive, we use the simulated task completion time as the +metric for UI optimization under different interface and algorithm setups. We +demonstrate the usage of this UI design method in several QA4ML applications. + +
+
+ comment: Published in ACM Transactions on Interactive Intelligent Systems +
+
+
+
+
+ + ♻ ☆ Extending regionalization algorithms to explore spatial process + heterogeneity + + +
+ In spatial regression models, spatial heterogeneity may be considered with
+either continuous or discrete specifications. The latter is related to
+delineation of spatially connected regions with homogeneous relationships
+between variables (spatial regimes). Although various regionalization
+algorithms have been proposed and studied in the field of spatial analytics,
+methods to optimize spatial regimes have been largely unexplored. In this
+paper, we propose two new algorithms for spatial regime delineation, two-stage
+K-Models and Regional-K-Models. We also extend the classic Automatic Zoning
+Procedure to the spatial regression context. The proposed algorithms are
+applied to a series of synthetic datasets and two real-world datasets. Results
+indicate that all three algorithms achieve superior or comparable performance
+to existing approaches, while the two-stage K-Models algorithm largely
+outperforms existing approaches on model fitting, region reconstruction, and
+coefficient estimation. Our work enriches the spatial analytics toolbox to
+explore spatial heterogeneous processes.
+ 
</p>
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Symmetry-Preserving Program Representations for Learning Code Semantics + + +
+ Large Language Models (LLMs) have shown promise in automated program +reasoning, a crucial aspect of many security tasks. However, existing LLM +architectures for code are often borrowed from other domains like natural +language processing, raising concerns about their generalization and robustness +to unseen code. A key generalization challenge is to incorporate the knowledge +of code semantics, including control and data flow, into the LLM architectures. + Drawing inspiration from examples of convolution layers exploiting +translation symmetry, we explore how code symmetries can enhance LLM +architectures for program analysis and modeling. We present a rigorous +group-theoretic framework that formally defines code symmetries as +semantics-preserving transformations and provides techniques for precisely +reasoning about symmetry preservation within LLM architectures. Using this +framework, we introduce a novel variant of self-attention that preserves +program symmetries, demonstrating its effectiveness in generalization and +robustness through detailed experimental evaluations across different binary +and source code analysis tasks. Overall, our code symmetry framework offers +rigorous and powerful reasoning techniques that can guide the future +development of specialized LLMs for code and advance LLM-guided program +reasoning tasks. + +
+
+
+
+
+ + ♻ ☆ On-Demand Communication for Asynchronous Multi-Agent Bandits AISTATS 2023 + + +
+ This paper studies a cooperative multi-agent multi-armed stochastic bandit +problem where agents operate asynchronously -- agent pull times and rates are +unknown, irregular, and heterogeneous -- and face the same instance of a +K-armed bandit problem. Agents can share reward information to speed up the +learning process at additional communication costs. We propose ODC, an +on-demand communication protocol that tailors the communication of each pair of +agents based on their empirical pull times. ODC is efficient when the pull +times of agents are highly heterogeneous, and its communication complexity +depends on the empirical pull times of agents. ODC is a generic protocol that +can be integrated into most cooperative bandit algorithms without degrading +their performance. We then incorporate ODC into the natural extensions of UCB +and AAE algorithms and propose two communication-efficient cooperative +algorithms. Our analysis shows that both algorithms are near-optimal in regret. + +
+
+ comment: Accepted by AISTATS 2023 +
+
+
+
+
+ + ♻ ☆ Visual correspondence-based explanations improve AI robustness and + human-AI team accuracy NeurIPS 2022 + + +
+ Explaining artificial intelligence (AI) predictions is increasingly important +and even imperative in many high-stakes applications where humans are the +ultimate decision-makers. In this work, we propose two novel architectures of +self-interpretable image classifiers that first explain, and then predict (as +opposed to post-hoc explanations) by harnessing the visual correspondences +between a query image and exemplars. Our models consistently improve (by 1 to 4 +points) on out-of-distribution (OOD) datasets while performing marginally worse +(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest +neighbor classifier (kNN). Via a large-scale, human study on ImageNet and CUB, +our correspondence-based explanations are found to be more useful to users than +kNN explanations. Our explanations help users more accurately reject AI's wrong +decisions than all other tested methods. Interestingly, for the first time, we +show that it is possible to achieve complementary human-AI team accuracy (i.e., +that is higher than either AI-alone or human-alone), in ImageNet and CUB image +classification tasks. + +
+
+ comment: NeurIPS 2022 conference paper +
+
+
+
+
+ + ♻ ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in +soybean sprouts image analysis. The CongNaMul dataset is curated to facilitate +tasks such as image classification, semantic segmentation, decomposition, and +measurement of length and weight. The classification task provides four classes +to determine the quality of soybean sprouts: normal, broken, spotted, and +broken and spotted, for the development of AI-aided automatic quality +inspection technology. For semantic segmentation, images with varying +complexity, from single sprout images to images with multiple sprouts, along +with human-labelled mask images, are included. The label has 4 different +classes: background, head, body, tail. The dataset also provides images and +masks for the image decomposition task, including two separate sprout images +and their combined form. Lastly, 5 physical features of sprouts (head length, +body length, body thickness, tail length, weight) are provided for image-based +measurement tasks. This dataset is expected to be a valuable resource for a +wide range of research and applications in the advanced analysis of images of +soybean sprouts. Also, we hope that this dataset can assist researchers +studying classification, semantic segmentation, decomposition, and physical +feature measurement in other industrial fields, in evaluating their models. The +dataset is available at the authors' repository. (https://bhban.kr/data) + +
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ♻ ☆ pTSE: A Multi-model Ensemble Method for Probabilistic Time Series + Forecasting IJCAI 2023 + + +
+ Various probabilistic time series forecasting models have sprung up and shown
+remarkably good performance. However, the choice of model highly relies on the
+characteristics of the input time series and the fixed distribution that the
+model is based on. Because probability distributions cannot be averaged over
+different models straightforwardly, current time series model ensemble methods
+cannot be directly applied to improve the robustness and accuracy of
+forecasting. To address this issue, we propose pTSE, a multi-model
+distribution ensemble method for probabilistic forecasting based on the Hidden
+Markov Model (HMM). pTSE only takes off-the-shelf outputs from member models
+without requiring further information about each model. Besides, we provide a
+complete theoretical analysis of pTSE to prove that the empirical distribution
+of a time series subject to an HMM will converge to the stationary distribution
+almost surely. Experiments on benchmarks show the superiority of pTSE over all
+member models and competitive ensemble methods.
+ 
</p>
+
+ comment: The 32nd International Joint Conference on Artificial Intelligence + (IJCAI 2023) +
+
+
+
+
+ + ♻ ☆ Stochastic Configuration Machines for Industrial Artificial Intelligence + + +
+ Real-time predictive modelling with desired accuracy is highly expected in
+industrial artificial intelligence (IAI), where neural networks play a key
+role. Neural networks in IAI require powerful, high-performance computing
+devices to process large amounts of floating-point data. Based on stochastic
+configuration networks (SCNs), this paper proposes a new randomized learner
+model, termed stochastic configuration machines (SCMs), that stresses effective
+modelling and data-size saving, which are useful and valuable for industrial
+applications. Compared to SCNs and random vector functional-link (RVFL) nets
+with binarized implementation, the model storage of SCMs can be significantly
+compressed while retaining favourable prediction performance. Besides the
+architecture of the SCM learner model and its learning algorithm, as an
+important part of this contribution, we also provide a theoretical basis on the
+learning capacity of SCMs by analysing the model's complexity. Experimental
+studies are carried out over some benchmark datasets and three industrial
+applications. The results demonstrate that SCM has great potential for dealing
+with industrial data analytics.
+ 
</p>
+
+ comment: 23 pages, 7 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ MGNN: Graph Neural Networks Inspired by Distance Geometry Problem KDD 2023 + + +
+ Graph Neural Networks (GNNs) have emerged as a prominent research topic in +the field of machine learning. Existing GNN models are commonly categorized +into two types: spectral GNNs, which are designed based on polynomial graph +filters, and spatial GNNs, which utilize a message-passing scheme as the +foundation of the model. For the expressive power and universality of spectral +GNNs, a natural approach is to improve the design of basis functions for better +approximation ability. As for spatial GNNs, models like Graph Isomorphism +Networks (GIN) analyze their expressive power based on Graph Isomorphism Tests. +Recently, there have been attempts to establish connections between spatial +GNNs and geometric concepts like curvature and cellular sheaves, as well as +physical phenomena like oscillators. However, despite the recent progress, +there is still a lack of comprehensive analysis regarding the universality of +spatial GNNs from the perspectives of geometry and physics. In this paper, we +propose MetricGNN (MGNN), a spatial GNN model inspired by the +congruent-insensitivity property of classifiers in the classification phase of +GNNs. We demonstrate that a GNN model is universal in the spatial domain if it +can generate embedding matrices that are congruent to any given embedding +matrix. This property is closely related to the Distance Geometry Problem +(DGP). Since DGP is an NP-Hard combinatorial optimization problem, we propose +optimizing an energy function derived from spring networks and the +Multi-Dimensional Scaling (MDS) problem. This approach also allows our model to +handle both homophilic and heterophilic graphs. Finally, we propose employing +the iteration method to optimize our energy function. We extensively evaluate +the effectiveness of our model through experiments conducted on both synthetic +and real-world datasets. Our code is available at: +https://github.com/GuanyuCui/MGNN. + +
+
+ comment: Accepted by KDD 2023 +
+
+
+
+
+ + ♻ ☆ Fair Attribute Completion on Graph with Missing Attributes + + +
+ Tackling unfairness in graph learning models is a challenging task, as the +unfairness issues on graphs involve both attributes and topological structures. +Existing work on fair graph learning simply assumes that attributes of all +nodes are available for model training and then makes fair predictions. In +practice, however, the attributes of some nodes might not be accessible due to +missing data or privacy concerns, which makes fair graph learning even more +challenging. In this paper, we propose FairAC, a fair attribute completion +method, to complement missing information and learn fair node embeddings for +graphs with missing attributes. FairAC adopts an attention mechanism to deal +with the attribute missing problem and meanwhile, it mitigates two types of +unfairness, i.e., feature unfairness from attributes and topological unfairness +due to attribute completion. FairAC can work on various types of homogeneous +graphs and generate fair embeddings for them and thus can be applied to most +downstream tasks to improve their fairness performance. To our best knowledge, +FairAC is the first method that jointly addresses the graph attribution +completion and graph unfairness problems. Experimental results on benchmark +datasets show that our method achieves better fairness performance with less +sacrifice in accuracy, compared with the state-of-the-art methods of fair graph +learning. Code is available at: https://github.com/donglgcn/FairAC. + +
+
+
+
+
+ + ♻ ☆ Backpropagation through Back Substitution with a Backslash + + +
+ We present a linear algebra formulation of backpropagation which allows the +calculation of gradients by using a generically written ``backslash'' or +Gaussian elimination on triangular systems of equations. Generally, the matrix +elements are operators. This paper has three contributions: (i) it is of +intellectual value to replace traditional treatments of automatic +differentiation with a (left acting) operator theoretic, graph-based approach; +(ii) operators can be readily placed in matrices in software in programming +languages such as Julia as an implementation option; (iii) we introduce a novel +notation, ``transpose dot'' operator ``$\{\}^{T_\bullet}$'' that allows for the +reversal of operators. + We further demonstrate the elegance of the operators approach in a suitable +programming language consisting of generic linear algebra operators such as +Julia \cite{bezanson2017julia}, and that it is possible to realize this +abstraction in code. Our implementation shows how generic linear algebra can +allow operators as elements of matrices. In contrast to ``operator +overloading,'' where backslash would normally have to be rewritten to take +advantage of operators, with ``generic programming'' there is no such need. + +
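A scalar/matrix toy of the idea (the paper's matrix entries are operators): for the linear chain x1 = A1 x0, x2 = A2 x1 with loss c . x2, the adjoints satisfy a unit upper-triangular block system, and solving it, i.e., applying "backslash", reproduces the reverse-mode gradient:

```python
import numpy as np

# Adjoints l0, l1, l2 of the chain satisfy the block upper-triangular system
#   [ I  -A1^T    0   ] [l0]   [0]
#   [ 0    I    -A2^T ] [l1] = [0]
#   [ 0    0      I   ] [l2]   [c]
# so back substitution yields l0 = d(loss)/d(x0), matching reverse-mode AD.
rng = np.random.default_rng(0)
n = 3
A1, A2 = rng.standard_normal((n, n)), rng.standard_normal((n, n))
c = rng.standard_normal(n)

Z, I = np.zeros((n, n)), np.eye(n)
M = np.block([[I, -A1.T, Z],
              [Z, I, -A2.T],
              [Z, Z, I]])
rhs = np.concatenate([np.zeros(n), np.zeros(n), c])

# Generic solve standing in for a triangular "backslash" routine.
lam = np.linalg.solve(M, rhs)
l0 = lam[:n]

assert np.allclose(l0, A1.T @ (A2.T @ c))   # matches the chain-rule adjoint
```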
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Neuronal diversity can improve machine learning for physics and beyond + + +
+ Diversity conveys advantages in nature, yet homogeneous neurons typically +comprise the layers of artificial neural networks. Here we construct neural +networks from neurons that learn their own activation functions, quickly +diversify, and subsequently outperform their homogeneous counterparts on image +classification and nonlinear regression tasks. Sub-networks instantiate the +neurons, which meta-learn especially efficient sets of nonlinear responses. +Examples include conventional neural networks classifying digits and +forecasting a van der Pol oscillator and physics-informed Hamiltonian neural +networks learning H\'enon-Heiles stellar orbits and the swing of a video +recorded pendulum clock. Such \textit{learned diversity} provides examples of +dynamical systems selecting diversity over uniformity and elucidates the role +of diversity in natural and artificial systems. + +
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Learning Optimal Strategies for Temporal Tasks in Stochastic Games + + +
+ Synthesis from linear temporal logic (LTL) specifications provides assured
+controllers for systems operating in stochastic and potentially adversarial
+environments. Automatic synthesis tools, however, require a model of the
+environment to construct controllers. In this work, we introduce a model-free
+reinforcement learning (RL) approach to derive controllers from given LTL
+specifications even when the environment is completely unknown. We model the
+problem as a stochastic game (SG) between the controller and the adversarial
+environment; we then learn optimal control strategies that maximize the
+probability of satisfying the LTL specifications against the worst-case
+environment behavior. We first construct a product game using the deterministic
+parity automaton (DPA) translated from the given LTL specification. By deriving
+distinct rewards and discount factors from the acceptance condition of the DPA,
+we reduce the maximization of the worst-case probability of satisfying the LTL
+specification into the maximization of a discounted reward objective in the
+product game; this enables the use of model-free RL algorithms to learn an
+optimal controller strategy. To deal with the common scalability problems when
+the number of sets defining the acceptance condition of the DPA (usually
+referred to as colors) is large, we propose a lazy color generation method
+where distinct rewards and discount factors are utilized only when needed, and
+an approximate method where the controller eventually focuses on only one
+color. In several case studies, we show that our approach is scalable to a wide
+range of LTL formulas, significantly outperforming existing methods for
+learning controllers from LTL specifications in SGs.
+ 
</p>
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Terrain Diffusion Network: Climatic-Aware Terrain Generation with + Geological Sketch Guidance + + +
+ Sketch-based terrain generation seeks to create realistic landscapes for
+virtual environments in various applications such as computer games, animation
+and virtual reality. Recently, deep learning based terrain generation has
+emerged, notably methods based on generative adversarial networks (GANs).
+However, these methods often struggle to fulfill the requirements of flexible
+user control and maintain generative diversity for realistic terrain.
+Therefore, we propose a novel diffusion-based method, namely terrain diffusion
+network (TDN), which actively incorporates user guidance for enhanced
+controllability, taking into account terrain features like rivers, ridges,
+basins, and peaks. Instead of adhering to a conventional monolithic denoising
+process, which often compromises the fidelity of terrain details or the
+alignment with user control, a multi-level denoising scheme is proposed to
+generate more realistic terrains by taking into account fine-grained details,
+particularly those related to climatic patterns influenced by erosion and
+tectonic activities. Specifically, three terrain synthesisers are designed for
+structural, intermediate, and fine-grained level denoising purposes, which
+allows each synthesiser to concentrate on a distinct terrain aspect. Moreover,
+to maximise the efficiency of our TDN, we further introduce terrain and sketch
+latent spaces for the synthesisers with pre-trained terrain autoencoders.
+Comprehensive experiments on a new dataset constructed from NASA Topology
+Images clearly demonstrate the effectiveness of our proposed method, achieving
+the state-of-the-art performance. Our code and dataset will be publicly
+available.
+ 
</p>
+
+
+
+
+ + ☆ End-Edge Coordinated Joint Encoding and Neural Enhancement for Low-Light + Video Analytics + + +
+ In this paper, we investigate video analytics in low-light environments, and
+propose an end-edge coordinated system with joint video encoding and
+enhancement. It adaptively transmits low-light videos from cameras and performs
+enhancement and inference tasks at the edge. Firstly, according to our
+observations, both encoding and enhancement for low-light videos have a
+significant impact on inference accuracy, which directly influences bandwidth
+and computation overhead. Secondly, due to the limitation of built-in
+computation resources, cameras perform encoding and transmitting frames to the
+edge. The edge executes neural enhancement to process low contrast, detail
+loss, and color distortion on low-light videos before inference. Finally, an
+adaptive controller is designed at the edge to select quantization parameters
+and scales of neural enhancement networks, aiming to improve the inference
+accuracy and meet the latency requirements. Extensive real-world experiments
+demonstrate that the proposed system can achieve a better trade-off between
+communication and computation resources and optimize the inference accuracy.
+ 
</p>
+
+
+
+
+ + ☆ Edge-Assisted Lightweight Region-of-Interest Extraction and Transmission + for Vehicle Perception + + +
+ To enhance on-road environmental perception for autonomous driving, accurate
+and real-time analytics on high-resolution video frames generated from on-board
+cameras becomes crucial. In this paper, we design a lightweight object
+location method based on class activation mapping (CAM) to rapidly capture the
+region of interest (RoI) boxes that contain driving safety related objects from
+on-board cameras, which can not only improve the inference accuracy of vision
+tasks, but also reduce the amount of transmitted data. Considering the limited
+on-board computation resources, the RoI boxes extracted from the raw image are
+offloaded to the edge for further processing. Considering both the dynamics of
+vehicle-to-edge communications and the limited edge resources, we propose an
+adaptive RoI box offloading algorithm to ensure prompt and accurate inference
+by adjusting the down-sampling rate of each box. Extensive experimental results
+on four high-resolution video streams demonstrate that our approach can
+effectively improve the overall accuracy by up to 16% and reduce the
+transmission demand by up to 49%, compared with other benchmarks.
+ 
</p>
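For readers unfamiliar with CAM: with a global-average-pooling classifier head, the class activation map is simply the classifier-weighted sum of the final convolutional feature maps, and thresholding it yields candidate RoI regions. A minimal NumPy sketch (the paper's exact box-extraction rule may differ):

```python
import numpy as np

def class_activation_map(feats, fc_weights, cls):
    """CAM for one class: weighted sum of the final conv feature maps.
    feats: (C, H, W) activations; fc_weights: (num_classes, C) weights of a
    global-average-pooling classifier head."""
    cam = np.tensordot(fc_weights[cls], feats, axes=1)   # -> (H, W)
    cam -= cam.min()
    return cam / (cam.max() + 1e-8)

def cam_to_roi_box(cam, thresh=0.5):
    """Threshold the CAM and return the single box enclosing all activated
    pixels -- a deliberately lightweight RoI proposal rule."""
    ys, xs = np.where(cam >= thresh)
    if len(ys) == 0:
        return None
    return (xs.min(), ys.min(), xs.max(), ys.max())      # (x1, y1, x2, y2)
```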
+
+
+
+
+ + ☆ Edge-Assisted On-Device Model Update for Video Analytics in Adverse + Environments + + +
+ While large deep neural networks excel at general video analytics tasks, the
+significant demand on computing capacity makes them infeasible for real-time
+inference on resource-constrained end cameras. In this paper, we propose an
+edge-assisted framework that continuously updates the lightweight model
+deployed on the end cameras to achieve accurate predictions in adverse
+environments. This framework consists of three modules, namely, a key frame
+extractor, a trigger controller, and a retraining manager. The low-cost key
+frame extractor obtains frames that can best represent the current environment.
+Those frames are then transmitted and buffered as the retraining data for model
+update at the edge server. Once the trigger controller detects a significant
+accuracy drop in the selected frames, the retraining manager outputs the
+optimal retraining configuration balancing the accuracy and time cost. We
+prototype our system on two end devices of different computing capacities with
+one edge server. The results demonstrate that our approach significantly
+improves accuracy across all tested adverse environment scenarios (up to 24%)
+and reduces more than 50% of the retraining time compared to existing
+benchmarks.
+ 
</p>
+
+
+
+
+ + ☆ Separate and Locate: Rethink the Text in Text-based Visual Question + Answering ACM MM 2023 + + +
+ Text-based Visual Question Answering (TextVQA) aims at answering questions +about the text in images. Most works in this field focus on designing network +structures or pre-training tasks. All these methods list the OCR texts in +reading order (from left to right and top to bottom) to form a sequence, which +is treated as a natural language ``sentence''. However, they ignore the fact +that most OCR words in the TextVQA task do not have a semantical contextual +relationship. In addition, these approaches use 1-D position embedding to +construct the spatial relation between OCR tokens sequentially, which is not +reasonable. The 1-D position embedding can only represent the left-right +sequence relationship between words in a sentence, but not the complex spatial +position relationship. To tackle these problems, we propose a novel method +named Separate and Locate (SaL) that explores text contextual cues and designs +spatial position embedding to construct spatial relations between OCR texts. +Specifically, we propose a Text Semantic Separate (TSS) module that helps the +model recognize whether words have semantic contextual relations. Then, we +introduce a Spatial Circle Position (SCP) module that helps the model better +construct and reason the spatial position relationships between OCR texts. Our +SaL model outperforms the baseline model by 4.44% and 3.96% accuracy on TextVQA +and ST-VQA datasets. Compared with the pre-training state-of-the-art method +pre-trained on 64 million pre-training samples, our method, without any +pre-training tasks, still achieves 2.68% and 2.52% accuracy improvement on +TextVQA and ST-VQA. Our code and models will be released at +https://github.com/fangbufang/SaL. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks, such as Reddit discussions. In contrast to traditional comment-only +methods, our approach to labelling a comment as hate speech involves a holistic +analysis of text and images grounded in the discussion context. This is done by +leveraging graph transformers to capture the contextual relationships in the +entire discussion surrounding a comment and grounding the interwoven fusion +layers that combine individual comments' text and image embeddings instead of +processing modalities separately. We compare the performance of our model to +baselines that only process individual comments and conduct extensive ablation +studies. To evaluate our work, we present a new dataset, HatefulDiscussions, +comprising complete multi-modal discussions from multiple online communities on +Reddit. We conclude with future work for multimodal solutions to deliver social +value in online contexts, arguing that capturing a holistic view of a +conversation significantly advances the effort to detect anti-social behaviour. + +
+
+ comment: Under Submission +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 49 + +
+
+
+ + ☆ Quantifying Uncertainty in Answers from any Language Model via Intrinsic + and Extrinsic Confidence Assessment + + +
+ We introduce BSDetector, a method for detecting bad and speculative answers
+from a pretrained Large Language Model by estimating a numeric confidence score
+for any output it generates. Our uncertainty quantification technique works for
+any LLM accessible only via a black-box API, and combines intrinsic and
+extrinsic assessments of confidence into a single trustworthiness estimate for
+any LLM response to a given prompt. Our method is extremely general and can be
+applied to all of the best LLMs available today (whose training data remains
+unknown). By expending a bit of extra computation, users of any LLM API can now
+get the same response as they would ordinarily, as well as a confidence
+estimate that cautions when not to trust this response. Experiments on both
+closed and open-form Question-Answer benchmarks reveal that BSDetector more
+accurately identifies incorrect LLM responses than alternative uncertainty
+estimation procedures (for both GPT-3 and ChatGPT). By sampling multiple
+responses from the LLM and considering the one with the highest confidence
+score, we can additionally obtain more accurate responses from the same LLM,
+without any extra training steps.
+
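+ The extrinsic, sampling-based half of such a confidence score can be sketched
+as follows (a simplification: agreement among sampled answers is used as the
+score, and ask_llm is a hypothetical black-box API callable):
+
+import random
+from collections import Counter
+
+def answer_with_confidence(ask_llm, prompt, k=5):
+    # Sample k responses from a black-box LLM callable and treat answer
+    # agreement as a crude extrinsic confidence estimate (sketch only;
+    # BSDetector additionally combines an intrinsic self-assessment).
+    samples = [ask_llm(prompt) for _ in range(k)]
+    answer, votes = Counter(samples).most_common(1)[0]
+    return answer, votes / k
+
+# Toy stand-in for an LLM API, for demonstration only.
+fake_llm = lambda prompt: random.choice(["Paris", "Paris", "Paris", "Lyon"])
+print(answer_with_confidence(fake_llm, "What is the capital of France?"))
+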
+
+
+
+
+ + ☆ Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open + Generative Large Language Models + + +
+ We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric +foundation and instruction-tuned open generative large language models (LLMs). +The models are based on the GPT-3 decoder-only architecture and are pretrained +on a mixture of Arabic and English texts, including source code in various +programming languages. With 13 billion parameters, they demonstrate better +knowledge and reasoning capabilities in Arabic than any existing open Arabic +and multilingual models by a sizable margin, based on extensive evaluation. +Moreover, the models are competitive in English compared to English-centric +open models of similar size, despite being trained on much less English data. +We provide a detailed description of the training, the tuning, the safety +alignment, and the evaluation of the models. We release two open versions of +the model -- the foundation Jais model, and an instruction-tuned Jais-chat +variant -- with the aim of promoting research on Arabic LLMs. Available at +https://huggingface.co/inception-mbzuai/jais-13b-chat + +
+
+ comment: Arabic-centric, foundation model, large-language model, LLM, + generative model, instruction-tuned, Jais, Jais-chat +
+
+
+
+
+ + ☆ LM-Infinite: Simple On-the-Fly Length Generalization for Large Language + Models + + +
+ In recent years, there have been remarkable advancements in the performance
+of Transformer-based Large Language Models (LLMs) across various domains. As
+these LLMs are deployed for increasingly complex tasks, they often face the
+need to conduct longer reasoning processes or to understand larger contexts.
+In these situations, the length generalization failure of LLMs on long
+sequences becomes more prominent. Most pre-training schemes truncate training
+sequences to a fixed length (such as 2048 for LLaMa). LLMs often struggle to
+generate fluent texts, let alone carry out downstream tasks, after longer
+contexts, even with relative positional encoding which is designed to cope with
+this problem. Common solutions such as finetuning on longer corpora often
+involve daunting hardware and time costs and require careful training process
+design. To more efficiently leverage the generation capacity of existing LLMs,
+we theoretically and empirically investigate the main out-of-distribution (OOD)
+factors contributing to this problem. Inspired by this diagnosis, we propose a
+simple yet effective solution for on-the-fly length generalization,
+LM-Infinite, which involves only a $\Lambda$-shaped attention mask and a
+distance limit while requiring no parameter updates or learning. We find it
+applicable to a variety of LLMs using relative-position encoding methods.
+LM-Infinite is computationally efficient with $O(n)$ time and space, and
+demonstrates consistent fluency and generation quality for sequences as long as
+32k tokens on ArXiv and OpenWebText2 datasets, with 2.72x decoding speedup. On
+downstream tasks such as passkey retrieval, it continues to work on inputs much
+longer than training lengths where vanilla models fail immediately.
+
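+ Only the mask shape is reconstructed in the sketch below (the number of global
+tokens and the window size are illustrative parameters, not the paper's
+settings):
+
+import numpy as np
+
+def lambda_mask(seq_len, n_global=4, window=1024):
+    # Boolean causal mask in which each query attends to the first
+    # n_global tokens plus a local window of recent tokens, giving the
+    # Lambda shape described above.
+    i = np.arange(seq_len)[:, None]   # query positions
+    j = np.arange(seq_len)[None, :]   # key positions
+    causal = j <= i
+    local = (i - j) < window
+    global_head = j < n_global
+    return causal & (local | global_head)
+
+print(lambda_mask(8, n_global=2, window=3).astype(int))
+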
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Response: Emergent analogical reasoning in large language models + + +
+ In their recent Nature Human Behaviour paper, "Emergent analogical reasoning +in large language models," (Webb, Holyoak, and Lu, 2023) the authors argue that +"large language models such as GPT-3 have acquired an emergent ability to find +zero-shot solutions to a broad range of analogy problems." In this response, we +provide counterexamples of the letter string analogies. In our tests, GPT-3 +fails to solve even the easiest variants of the problems presented in the +original paper. Zero-shot reasoning is an extraordinary claim that requires +extraordinary evidence. We do not see that evidence in our experiments. To +strengthen claims of humanlike reasoning such as zero-shot reasoning, it is +important that the field develop approaches that rule out data memorization. + +
+
+ comment: Response to publication in Nature Human Behaviour titled "Emergent + analogical reasoning in large language models," (Webb, Holyoak, and Lu, 2023, + arXiv:2212.09196). 9 pages +
+
+
+
+
+ + ☆ Grandma Karl is 27 years old -- research agenda for pseudonymization of + research data + + +
+ Accessibility of research data is critical for advances in many research
+fields, but textual data often cannot be shared due to the personal and
+sensitive information which it contains, e.g. names or political opinions.
+General Data Protection Regulation (GDPR) suggests pseudonymization as a
+solution to secure open access to research data, but we need to learn more
+about pseudonymization as an approach before adopting it for manipulation of
+research data. This paper outlines a research agenda within pseudonymization,
+namely the need for studies into the effects of pseudonymization on
+unstructured data in relation to e.g. readability and language assessment, as
+well as the effectiveness of pseudonymization as a way of protecting writer
+identity, while also exploring different ways of developing context-sensitive
+algorithms for detection, labelling and replacement of personal information in
+unstructured data. The recently granted project on pseudonymization Grandma
+Karl is 27 years old addresses exactly those challenges.
+
+
+ comment: Big DataService 2023 conference, 2023 Workshop on Big Data and + Machine Learning with Privacy Enhancing Tech, IEEE Catalog Number: + CFP23A91-ART, ISBN: 979-8-3503-3379-4 +
+
+
+
+
+ + ☆ Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for + English to Indian Languages + + +
+ The study investigates the effectiveness of utilizing multimodal information
+in Neural Machine Translation (NMT). While prior research focused on using
+multimodal data in low-resource scenarios, this study examines how image
+features impact translation when added to a large-scale, pre-trained unimodal
+NMT system. Surprisingly, the study finds that images might be redundant in
+this context. Additionally, the research introduces synthetic noise to assess
+whether images help the model deal with textual noise. Multimodal models
+slightly outperform text-only models in noisy settings, even with random
+images. The study's experiments cover translation from English to Hindi,
+Bengali, and Malayalam, significantly outperforming state-of-the-art
+benchmarks. Interestingly, the effect of visual context varies with source text
+noise: no visual context works best for non-noisy translations, cropped image
+features are optimal for low noise, and full image features work better in
+high-noise scenarios. This sheds light on the role of visual context,
+especially in noisy settings, opening up a new research direction for Noisy
+Neural Machine Translation in multimodal setups. The research emphasizes the
+importance of combining visual and textual information for improved translation
+in various environments.
+
+
+
+
+
+ + ☆ Conti Inc.: Understanding the Internal Discussions of a large + Ransomware-as-a-Service Operator with Machine Learning + + +
+ Ransomware-as-a-service (RaaS) is increasing the scale and complexity of
+ransomware attacks. Understanding the internal operations behind RaaS has been
+a challenge due to the illegality of such activities. The recent chat leak of
+the Conti RaaS operator, one of the most infamous ransomware operators on the
+international scene, offers a key opportunity to better understand the inner
+workings of such organizations. This paper analyzes the main topic discussions
+in the Conti chat leak using machine learning techniques such as Natural
+Language Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as
+visualization strategies. Five discussion topics are found: 1) Business, 2)
+Technical, 3) Internal tasking/Management, 4) Malware, and 5) Customer
+Service/Problem Solving. Moreover, the distribution of topics among Conti
+members shows that only 4% of individuals have specialized discussions while
+almost all individuals (96%) are all-rounders, meaning that their discussions
+revolve around the five topics. The results also indicate that a significant
+proportion of Conti discussions are non-tech related. This study thus
+highlights that running such large RaaS operations requires a workforce skilled
+beyond technical abilities, with individuals involved in various tasks, from
+management to customer service or problem solving. The discussion topics also
+show that the organization behind the Conti RaaS operator shares
+similarities with a large firm. We conclude that, although RaaS represents an
+example of specialization in the cybercrime industry, only a few members are
+specialized in one topic, while the rest runs and coordinates the RaaS
+operation.
+
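+ The topic-modelling step is standard; a minimal sketch with scikit-learn
+(the documents below are placeholders, not the leaked chats, and the actual
+study settles on five topics):
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+
+# Placeholder documents standing in for pre-processed chat messages.
+docs = ["wire the payment to the client", "fix the loader build today",
+        "assign this task to the new team", "send the decryption key to the victim"]
+
+vec = CountVectorizer(stop_words="english")
+X = vec.fit_transform(docs)
+lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
+
+terms = vec.get_feature_names_out()
+for k, comp in enumerate(lda.components_):
+    print("topic", k, [terms[i] for i in comp.argsort()[-3:]])
+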
+
+
+
+
+ + ☆ Text-to-OverpassQL: A Natural Language Interface for Complex Geodata + Querying of OpenStreetMap + + +
+ We present Text-to-OverpassQL, a task designed to facilitate a natural
+language interface for querying geodata from OpenStreetMap (OSM). The Overpass
+Query Language (OverpassQL) allows users to formulate complex database queries
+and is widely adopted in the OSM ecosystem. Generating Overpass queries from
+natural language input serves multiple use-cases. It enables novice users to
+utilize OverpassQL without prior knowledge, assists experienced users with
+crafting advanced queries, and enables tool-augmented large language models to
+access information stored in the OSM database. In order to assess the
+performance of current sequence generation models on this task, we propose
+OverpassNL, a dataset of 8,352 queries with corresponding natural language
+inputs. We further introduce task-specific evaluation metrics and ground the
+evaluation of the Text-to-OverpassQL task by executing the queries against the
+OSM database. We establish strong baselines by finetuning sequence-to-sequence
+models and adapting large language models with in-context examples. The
+detailed evaluation reveals strengths and weaknesses of the considered learning
+strategies, laying the foundations for further research into the
+Text-to-OverpassQL task.
+
+
+
+
+
+ + ☆ AsyncET: Asynchronous Learning for Knowledge Graph Entity Typing with + Auxiliary Relations + + +
+ Knowledge graph entity typing (KGET) is a task to predict the missing entity
+types in knowledge graphs (KG). Previously, KG embedding (KGE) methods tried to
+solve the KGET task by introducing an auxiliary relation, 'hasType', to model
+the relationship between entities and their types. However, a single auxiliary
+relation has limited expressiveness for diverse entity-type patterns. We
+improve the expressiveness of KGE methods by introducing multiple auxiliary
+relations in this work. Similar entity types are grouped to reduce the number
+of auxiliary relations and improve their capability to model entity-type
+patterns with different granularities. With the presence of multiple auxiliary
+relations, we propose a method adopting an Asynchronous learning scheme for
+Entity Typing, named AsyncET, which updates the entity and type embeddings
+alternately to keep the learned entity embedding up-to-date and informative
+for entity type prediction. Experiments are conducted on two commonly used KGET
+datasets to show that the performance of KGE methods on the KGET task can be
+substantially improved by the proposed multiple auxiliary relations and
+asynchronous embedding learning. Furthermore, our method has a significant
+advantage over state-of-the-art methods in model sizes and time complexity.
+
+
+
+
+
+ + ☆ FPTQ: Fine-grained Post-Training Quantization for Large Language Models + + +
+ In the era of large-scale language models, the substantial parameter size
+poses significant challenges for deployment. Being a prevalent compression
+technique, quantization has emerged as the mainstream practice to tackle this
+issue, which is mainly centered on two recipes, W8A8 and W4A16 (i.e. weights
+and activations in such bit widths). In this study, we propose a novel W4A8
+post-training quantization method for the available open-sourced LLMs, which
+combines the advantages of both recipes. In this way, we can leverage the I/O
+benefit of 4-bit weight quantization and the acceleration due to 8-bit matrix
+computation. Nevertheless, W4A8 faces notorious performance degradation. As a
+remedy, we involve layerwise activation quantization strategies that feature a
+novel logarithmic equalization for the most intractable layers, and we combine
+them with fine-grained weight quantization. Without bells and whistles, we
+eliminate the necessity for further fine-tuning and obtain state-of-the-art
+W4A8 quantized performance on BLOOM, LLaMA, and LLaMA-2 on standard benchmarks.
+We confirm that the W4A8 quantization is achievable for the deployment of large
+language models, fostering their widespread real-world application.
+
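+ Fine-grained (per-group) weight quantization, one ingredient named above, can
+be sketched generically (symmetric int4 grouping is an assumption here; the
+paper's logarithmic activation equalization is not shown):
+
+import numpy as np
+
+def quantize_4bit(w, group_size=64):
+    # Symmetric per-group 4-bit quantization of a flat weight vector:
+    # each group gets its own scale, which is what "fine-grained" buys.
+    w = w.reshape(-1, group_size)
+    scale = np.abs(w).max(axis=1, keepdims=True) / 7.0   # int4 range [-8, 7]
+    q = np.clip(np.round(w / scale), -8, 7)
+    return q, scale
+
+w = np.random.randn(256).astype(np.float32)
+q, scale = quantize_4bit(w)
+w_hat = (q * scale).reshape(-1)
+print("max abs error:", float(np.abs(w_hat - w).max()))
+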
+
+
+
+
+ + ☆ MerA: Merging Pretrained Adapters For Few-Shot Learning + + +
+ Adapter tuning, which updates only a few parameters, has become a mainstream
+method for fine-tuning pretrained language models to downstream tasks. However,
+it often yields subpar results in few-shot learning. AdapterFusion, which
+assembles pretrained adapters using composition layers tailored to specific
+tasks, is a possible solution but significantly increases trainable parameters
+and deployment costs. Despite this, our preliminary study reveals that even
+single adapters can outperform AdapterFusion in few-shot learning, urging us to
+propose \textbf{\texttt{Merging Pretrained Adapters}} (MerA) that efficiently
+incorporates pretrained adapters into a single model through model fusion.
+Extensive experiments on two PLMs demonstrate that MerA achieves substantial
+improvements compared to both single adapters and AdapterFusion. To further
+enhance the capacity of MerA, we also introduce a simple yet effective
+technique, referred to as the "\textit{same-track}" setting, that merges
+adapters from the same track of pretraining tasks. With the implementation of
+the "\textit{same-track}" setting, we observe even more impressive gains,
+surpassing the performance of both full fine-tuning and adapter tuning by a
+substantial margin, e.g., 3.5\% in MRPC and 5.0\% in MNLI.
+
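+ A simplified stand-in for the fusion step is plain weighted parameter
+averaging (MerA's exact merging procedure is defined in the paper; this sketch
+only conveys the shape of the operation):
+
+import torch
+
+def merge_adapters(state_dicts, weights=None):
+    # Merge several adapter checkpoints into one by weighted averaging
+    # of like-named parameters (a simplification of model fusion).
+    n = len(state_dicts)
+    weights = weights or [1.0 / n] * n
+    return {key: sum(w * sd[key] for w, sd in zip(weights, state_dicts))
+            for key in state_dicts[0]}
+
+a = {"adapter.weight": torch.ones(2, 2)}
+b = {"adapter.weight": torch.zeros(2, 2)}
+print(merge_adapters([a, b])["adapter.weight"])  # 0.5 everywhere
+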
+
+
+
+
+ + ☆ Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting + + +
+ The task of radiology reporting comprises describing and interpreting the +medical findings in radiographic images, including description of their +location and appearance. Automated approaches to radiology reporting require +the image to be encoded into a suitable token representation for input to the +language model. Previous methods commonly use convolutional neural networks to +encode an image into a series of image-level feature map representations. +However, the generated reports often exhibit realistic style but imperfect +accuracy. Inspired by recent works for image captioning in the general domain +in which each visual token corresponds to an object detected in an image, we +investigate whether using local tokens corresponding to anatomical structures +can improve the quality of the generated reports. We introduce a novel +adaptation of Faster R-CNN in which finding detection is performed for the +candidate bounding boxes extracted during anatomical structure localisation. We +use the resulting bounding box feature representations as our set of +finding-aware anatomical tokens. This encourages the extracted anatomical +tokens to be informative about the findings they contain (required for the +final task of radiology reporting). Evaluating on the MIMIC-CXR dataset of +chest X-Ray images, we show that task-aware anatomical tokens give +state-of-the-art performance when integrated into an automated reporting +pipeline, yielding generated reports with improved clinical accuracy. + +
+
+
+
+
+ + ☆ Benchmarking Multilabel Topic Classification in the Kyrgyz Language + + +
+ Kyrgyz is a very underrepresented language in terms of modern natural +language processing resources. In this work, we present a new public benchmark +for topic classification in Kyrgyz, introducing a dataset based on collected +and annotated data from the news site 24.KG and presenting several baseline +models for news classification in the multilabel setting. We train and evaluate +both classical statistical and neural models, reporting the scores, discussing +the results, and proposing directions for future work. + +
+
+ comment: Accepted to AIST 2023 +
+
+
+
+
+ + ☆ LLaSM: Large Language and Speech Model + + +
+ Multi-modal large language models have garnered significant interest
+recently. Most of this work, though, focuses on vision-language multi-modal
+models that provide strong capabilities in following vision-and-language
+instructions. However, we claim that speech is also an important modality
+through which humans interact with the world. Hence, it is crucial for a
+general-purpose assistant to be able to follow multi-modal speech-and-language
+instructions. In this work, we propose Large Language and Speech Model (LLaSM).
+LLaSM is an end-to-end trained large multi-modal speech-language model with
+cross-modal conversational abilities, capable of following speech-and-language
+instructions. Our early experiments show that LLaSM demonstrates a more
+convenient and natural way for humans to interact with artificial intelligence.
+We also release a large Speech Instruction Following dataset,
+LLaSM-Audio-Instructions. Code and demo are available at
+https://github.com/LinkSoul-AI/LLaSM and
+https://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions
+dataset is available at
+https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.
+
+
+
+
+
+ + ☆ Is the U.S. Legal System Ready for AI's Challenges to Human Values? + + +
+ Our interdisciplinary study investigates how effectively U.S. laws confront
+the challenges posed by Generative AI to human values. Through an analysis of
+diverse hypothetical scenarios crafted during an expert workshop, we have
+identified notable gaps and uncertainties within the existing legal framework
+regarding the protection of fundamental values, such as autonomy, privacy,
+dignity, diversity, equality, and physical/mental well-being. Constitutional
+and civil rights, it appears, may not provide sufficient protection against
+AI-generated discriminatory outputs. Furthermore, even if we exclude the
+liability shield provided by Section 230, proving causation for defamation and
+product liability claims is a challenging endeavor due to the intricate and
+opaque nature of AI systems. To address the unique and unforeseeable threats
+posed by Generative AI, we advocate for legal frameworks that evolve to
+recognize new threats and provide proactive, auditable guidelines to industry
+stakeholders. Addressing these issues requires deep interdisciplinary
+collaborations to identify harms, values, and mitigation strategies.
+
+
+ comment: 26 pages, 7 figures +
+
+
+
+
+ + ☆ Towards One-Shot Learning for Text Classification using Inductive Logic + Programming + + +
+ With the ever-increasing potential of AI to perform personalised tasks, it is
+becoming essential to develop new machine learning techniques which are
+data-efficient and do not require hundreds or thousands of training examples.
+In this paper, we explore an Inductive Logic Programming approach for one-shot
+text classification. In particular, we explore the framework of
+Meta-Interpretive Learning (MIL), along with using common-sense background
+knowledge extracted from ConceptNet. Results indicate that MIL can learn text
+classification rules from a small number of training examples. Moreover, the
+higher the complexity of the chosen examples, the higher the accuracy of the
+outcome.
+
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Knowledge-grounded Natural Language Recommendation Explanation + + +
+ Explanations accompanied by a recommendation can assist users in +understanding the decision made by recommendation systems, which in turn +increases a user's confidence and trust in the system. Recently, research has +focused on generating natural language explanations in a human-readable format. +Thus far, the proposed approaches leverage item reviews written by users, which +are often subjective, sparse in language, and unable to account for new items +that have not been purchased or reviewed before. Instead, we aim to generate +fact-grounded recommendation explanations that are objectively described with +item features while implicitly considering a user's preferences, based on the +user's purchase history. To achieve this, we propose a knowledge graph (KG) +approach to natural language explainable recommendation. Our approach draws on +user-item features through a novel collaborative filtering-based KG +representation to produce fact-grounded, personalized explanations, while +jointly learning user-item representations for recommendation scoring. +Experimental results show that our approach consistently outperforms previous +state-of-the-art models on natural language explainable recommendation. + +
+
+
+
+
+ + ☆ Peering Through Preferences: Unraveling Feedback Acquisition for + Aligning Large Language Models + + +
+ Aligning large language models (LLMs) with human values and intents
+critically involves the use of human or AI feedback. While dense feedback
+annotations are expensive to acquire and integrate, sparse feedback presents a
+structural design choice between ratings (e.g., score Response A on a scale of
+1-7) and rankings (e.g., is Response A better than Response B?). In this work,
+we analyze the effect of this design choice for the alignment and evaluation of
+LLMs. We uncover an inconsistency problem wherein the preferences inferred from
+ratings and rankings significantly disagree, in 60% of cases, for both human
+and AI annotators. Our subsequent analysis identifies various facets of
+annotator biases that explain this phenomenon, such as that human annotators
+rate denser responses higher while preferring accuracy during pairwise
+judgments. To our surprise, we observe that the choice of feedback protocol
+also has a significant effect on the evaluation of aligned LLMs. In particular,
+we find that LLMs that leverage rankings data for alignment (say model X) are
+preferred over those that leverage ratings data (say model Y), with a
+rank-based evaluation protocol (is X/Y's response better than reference
+response?) but not with a rating-based evaluation protocol (score X/Y's
+response on a scale of 1-7). Our findings thus shed light on critical gaps in
+methods for evaluating the real-world utility of language models and their
+strong dependence on the feedback protocol used for alignment. Our code and
+data are available at https://github.com/Hritikbansal/sparse_feedback.
+
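+ The rating-versus-ranking inconsistency can be measured with a few lines (the
+data layout is illustrative; ties under ratings are skipped here by
+assumption):
+
+def disagreement_rate(ratings, rankings):
+    # ratings: list of (score_a, score_b); rankings: list of "A"/"B" labels.
+    # Returns the fraction of non-tied pairs where the preference implied
+    # by the scalar ratings contradicts the pairwise ranking label.
+    conflicts, counted = 0, 0
+    for (score_a, score_b), ranked in zip(ratings, rankings):
+        if score_a == score_b:
+            continue
+        counted += 1
+        conflicts += ("A" if score_a > score_b else "B") != ranked
+    return conflicts / counted if counted else 0.0
+
+print(disagreement_rate([(6, 4), (3, 5), (7, 2)], ["A", "A", "B"]))  # 2/3
+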
+
+ comment: 24 pages, 12 Tables, 3 Figures +
+
+
+
+
+ + ☆ HAlf-MAsked Model for Named Entity Sentiment analysis + + +
+ Named Entity Sentiment analysis (NESA) is one of the most actively developing
+application domains in Natural Language Processing (NLP). Social media NESA is
+a significant field of opinion analysis since detecting and tracking sentiment
+trends in the news flow is crucial for building various analytical systems and
+monitoring the media image of specific people or companies. In this paper, we
+study different transformer-based solutions for NESA in the RuSentNE-23
+evaluation. Despite the effectiveness of the BERT-like models, they can still
+struggle with certain challenges, such as overfitting, which appeared to be the
+main obstacle in achieving high accuracy on the RuSentNE-23 data. We present
+several approaches to overcome this problem, among which is a novel technique
+of an additional pass over the given data with a masked entity before making
+the final prediction, so that we can combine logits from the model when it
+knows the exact entity it predicts sentiment for and when it does not.
+Utilizing this technique, we ensemble multiple BERT-like models trained on
+different subsets of data to improve overall performance. Our proposed model
+achieves the best result on RuSentNE-23 evaluation data and demonstrates
+improved consistency in entity-level sentiment analysis.
+
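+ The logit-combination step admits a compact sketch (the equal-weight mixing
+coefficient is an illustrative assumption):
+
+import numpy as np
+
+def combined_prediction(logits_with_entity, logits_masked, alpha=0.5):
+    # Mix sentiment logits from a pass that sees the target entity with
+    # logits from a pass where the entity is masked, then take argmax.
+    mixed = alpha * np.asarray(logits_with_entity) + \
+            (1 - alpha) * np.asarray(logits_masked)
+    return int(np.argmax(mixed))
+
+print(combined_prediction([0.2, 1.5, -0.3], [0.8, 0.1, 0.4]))  # class 1
+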
+
+
+
+
+ + ☆ Task-Based MoE for Multitask Multilingual Machine Translation + + +
+ The mixture-of-experts (MoE) architecture has proven to be a powerful method
+for training deep models on diverse tasks in many applications. However,
+current MoE implementations are task agnostic, treating all tokens from
+different tasks in the same manner. In this work, we instead design a novel
+method that incorporates task information into MoE models at different granular
+levels with shared dynamic task-based adapters. Our experiments and analysis
+show the advantages of our approaches over the dense and canonical MoE models
+on multi-task multilingual machine translation. With task-specific adapters,
+our models can additionally generalize to new tasks efficiently.
+
+
+
+
+
+ + ☆ Cyberbullying Detection for Low-resource Languages and Dialects: Review + of the State of the Art + + +
+ The struggle of social media platforms to moderate content in a timely
+manner encourages users to abuse such platforms to spread vulgar or abusive
+language, which, when performed repeatedly, becomes cyberbullying: a social
+problem taking place in virtual environments, yet with real-world consequences,
+such as depression, withdrawal, or even suicide attempts of its victims.
+Systems for the automatic detection and mitigation of cyberbullying have been
+developed but, unfortunately, the vast majority of them are for the English
+language, with only a handful available for low-resource languages. To estimate
+the present state of research and recognize the need for further development,
+in this paper we present a comprehensive systematic survey of studies done so
+far for automatic cyberbullying detection in low-resource languages. We
+analyzed all studies on this topic that were available. We investigated more
+than seventy published studies on automatic detection of cyberbullying or
+related language in low-resource languages and dialects that were published
+between around 2017 and January 2023. There are 23 low-resource languages and
+dialects covered by this paper, including Bangla, Hindi, Dravidian languages
+and others. In the survey, we identify some of the research gaps of previous
+studies, which include the lack of reliable definitions of cyberbullying and
+its relevant subcategories, and biases in the acquisition and annotation of
+data. Based on recognizing those research gaps, we provide some suggestions for
+improving the general research conduct in cyberbullying detection, with a
+primary focus on low-resource languages. Based on those proposed suggestions,
+we collect and release a cyberbullying dataset in the Chittagonian dialect of
+Bangla and propose a number of initial ML solutions trained on that dataset. In
+addition, the pre-trained transformer-based BanglaBERT model was also
+attempted.
+
+
+ comment: 52 Pages +
+
+
+
+
+ + ☆ Quantifying and Analyzing Entity-level Memorization in Large Language + Models + + +
+ Large language models (LLMs) have been proven capable of memorizing their
+training data, which can be extracted through specifically designed prompts. As
+the scale of datasets continues to grow, privacy risks arising from
+memorization have attracted increasing attention. Quantifying language model
+memorization helps evaluate potential privacy risks. However, prior works on
+quantifying memorization require access to the precise original data or incur
+substantial computational overhead, making it difficult for applications in
+real-world language models. To this end, we propose a fine-grained,
+entity-level definition to quantify memorization with conditions and metrics
+closer to real-world scenarios. In addition, we also present an approach for
+efficiently extracting sensitive entities from autoregressive language models.
+We conduct extensive experiments based on the proposed definition, probing
+language models' ability to reconstruct sensitive entities under different
+settings. We find that language models have strong memorization at the entity
+level and are able to reproduce the training data even with partial leakages.
+The results demonstrate that LLMs not only memorize their training data but
+also understand associations between entities. These findings necessitate that
+trainers of LLMs exercise greater prudence regarding model memorization,
+adopting memorization mitigation techniques to preclude privacy violations.
+
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Optimizing Factual Accuracy in Text Generation through Dynamic Knowledge + Selection + + +
+ Language models (LMs) have revolutionized the way we interact with
+information, but they often generate nonfactual text, raising concerns about
+their reliability. Previous methods use external knowledge as references for
+text generation to enhance factuality but often struggle with the knowledge
+mix-up (e.g., entity mismatch) of irrelevant references. Besides, as the length
+of the output text grows, the randomness of sampling can escalate,
+detrimentally impacting the factual accuracy of the generated text. In this
+paper, we present DKGen, which divides the text generation process into an
+iterative one. In each iteration, DKGen takes the input query, the previously
+generated text and a subset of the reference passages as input to generate
+short text. During the process, the subset is dynamically selected from the
+full passage set based on their relevance to the previously generated text and
+the query, largely eliminating the irrelevant references from input. To further
+enhance DKGen's ability to correctly use this external knowledge, DKGen
+distills the relevance order of reference passages to the cross-attention
+distribution of the decoder. We train and evaluate DKGen on a large-scale
+benchmark dataset. Experimental results show that DKGen outperforms all
+baseline models.
+
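+ The iterative generate-and-reselect loop can be sketched as below (relevance
+and generate_chunk are hypothetical callables standing in for the trained
+ranking and generation models):
+
+def dkgen_loop(query, passages, relevance, generate_chunk, k=3, max_iters=5):
+    # Each round: re-rank the reference passages against the query plus
+    # the text generated so far, keep the top-k, and generate one more
+    # short chunk conditioned on them (a sketch of the control flow only).
+    text = ""
+    for _ in range(max_iters):
+        ranked = sorted(passages,
+                        key=lambda p: relevance(query + " " + text, p),
+                        reverse=True)
+        chunk = generate_chunk(query, text, ranked[:k])
+        if not chunk:
+            break
+        text += chunk
+    return text
+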
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Affective Visual Dialog: A Large-Scale Benchmark for Emotional Reasoning + Based on Visually Grounded Conversations + + +
+ We introduce Affective Visual Dialog, an emotion explanation and reasoning +task as a testbed for research on understanding the formation of emotions in +visually grounded conversations. The task involves three skills: (1) +Dialog-based Question Answering (2) Dialog-based Emotion Prediction and (3) +Affective emotion explanation generation based on the dialog. Our key +contribution is the collection of a large-scale dataset, dubbed AffectVisDial, +consisting of 50K 10-turn visually grounded dialogs as well as concluding +emotion attributions and dialog-informed textual emotion explanations, +resulting in a total of 27,180 working hours. We explain our design decisions +in collecting the dataset and introduce the questioner and answerer tasks that +are associated with the participants in the conversation. We train and +demonstrate solid Affective Visual Dialog baselines adapted from +state-of-the-art models. Remarkably, the responses generated by our models show +promising emotional reasoning abilities in response to visually grounded +conversations. Our project page is available at +https://affective-visual-dialog.github.io. + +
+
+
+
+
+ + ☆ ToddlerBERTa: Exploiting BabyBERTa for Grammar Learning and Language + Understanding + + +
+ We present ToddlerBERTa, a BabyBERTa-like language model, exploring its
+capabilities through five different models with varied hyperparameters.
+Evaluating on BLiMP, SuperGLUE, MSGS, and a Supplement benchmark from the
+BabyLM challenge, we find that smaller models can excel in specific tasks,
+while larger models perform well with substantial data. Despite training on a
+smaller dataset, ToddlerBERTa demonstrates commendable performance, rivalling
+the state-of-the-art RoBERTa-base. The model showcases robust language
+understanding, even with single-sentence pretraining, and competes with
+baselines that leverage broader contextual information. Our work provides
+insights into hyperparameter choices and data utilization, contributing to the
+advancement of language models.
+
+
+
+
+
+ + ♻ ☆ Large Language Models in Cryptocurrency Securities Cases: Can ChatGPT + Replace Lawyers? + + +
+ Large Language Models (LLMs) could enhance access to the legal system. +However, empirical research on their effectiveness in conducting legal tasks is +scant. We study securities cases involving cryptocurrencies as one of numerous +contexts where AI could support the legal process, studying LLMs' legal +reasoning and drafting capabilities. We examine whether a) an LLM can +accurately determine which laws are potentially being violated from a fact +pattern, and b) whether there is a difference in juror decision-making based on +complaints written by a lawyer compared to an LLM. We feed fact patterns from +real-life cases to GPT-3.5 and evaluate its ability to determine correct +potential violations from the scenario and exclude spurious violations. Second, +we had mock jurors assess complaints written by the LLM and lawyers. GPT-3.5's +legal reasoning skills proved weak, though we expect improvement in future +models, particularly given the violations it suggested tended to be correct (it +merely missed additional, correct violations). GPT-3.5 performed better at +legal drafting, and jurors' decisions were not statistically significantly +associated with the author of the document upon which they based their +decisions. Because LLMs cannot satisfactorily conduct legal reasoning tasks, +they would be unable to replace lawyers at this stage. However, their drafting +skills (though, perhaps, still inferior to lawyers), could provide access to +justice for more individuals by reducing the cost of legal services. Our +research is the first to systematically study LLMs' legal drafting and +reasoning capabilities in litigation, as well as in securities law and +cryptocurrency-related misconduct. + +
+
+
+
+
+ + ♻ ☆ Going Beyond Nouns With Vision & Language Models Using Synthetic Data ICCV 2023 + + +
+ Large-scale pre-trained Vision & Language (VL) models have shown remarkable
+performance in many applications, enabling the replacement of a fixed set of
+supported classes with zero-shot open vocabulary reasoning over (almost
+arbitrary) natural language prompts. However, recent works have uncovered a
+fundamental weakness of these models: for example, their difficulty in
+understanding Visual Language Concepts (VLC) that go 'beyond nouns', such as
+the meaning of non-object words (e.g., attributes, actions, relations, states,
+etc.), or in performing compositional reasoning, such as understanding the
+significance of the order of the words in a sentence. In this work, we
+investigate to what extent purely synthetic data could be leveraged to teach
+these models to overcome such shortcomings without compromising their zero-shot
+capabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale
+synthetic dataset and data generation codebase that allows generating
+additional suitable data to improve VLC understanding and compositional
+reasoning of VL models. Additionally, we propose a general VL finetuning
+strategy for effectively leveraging SyViC towards achieving these improvements.
+Our extensive experiments and ablations on VL-Checklist, Winoground, and ARO
+benchmarks demonstrate that it is possible to adapt strong pre-trained VL
+models with synthetic data significantly enhancing their VLC understanding
+(e.g. by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their
+zero-shot accuracy.
+
+
+ comment: Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/ +
+
+
+
+
+ + ♻ ☆ Evaluating GPT-3 Generated Explanations for Hateful Content Moderation IJCAI + + +
+ Recent research has focused on using large language models (LLMs) to generate +explanations for hate speech through fine-tuning or prompting. Despite the +growing interest in this area, these generated explanations' effectiveness and +potential limitations remain poorly understood. A key concern is that these +explanations, generated by LLMs, may lead to erroneous judgments about the +nature of flagged content by both users and content moderators. For instance, +an LLM-generated explanation might inaccurately convince a content moderator +that a benign piece of content is hateful. In light of this, we propose an +analytical framework for examining hate speech explanations and conducted an +extensive survey on evaluating such explanations. Specifically, we prompted +GPT-3 to generate explanations for both hateful and non-hateful content, and a +survey was conducted with 2,400 unique respondents to evaluate the generated +explanations. Our findings reveal that (1) human evaluators rated the +GPT-generated explanations as high quality in terms of linguistic fluency, +informativeness, persuasiveness, and logical soundness, (2) the persuasive +nature of these explanations, however, varied depending on the prompting +strategy employed, and (3) this persuasiveness may result in incorrect +judgments about the hatefulness of the content. Our study underscores the need +for caution in applying LLM-generated explanations for content moderation. Code +and results are available at https://github.com/Social-AI-Studio/GPT3-HateEval. + +
+
+ comment: 9 pages, 2 figures, Accepted by International Joint Conference on + Artificial Intelligence(IJCAI) +
+
+
+
+
+ + ♻ ☆ Effect of Attention and Self-Supervised Speech Embeddings on + Non-Semantic Speech Tasks + + +
+ Human emotion understanding is pivotal in making conversational technology
+mainstream. We view speech emotion understanding as a perception task, which is
+a more realistic setting. With varying contexts (languages, demographics,
+etc.), different shares of people perceive the same speech segment as a
+non-unanimous emotion. As part of the ACM Multimedia 2023 Computational
+Paralinguistics ChallengE (ComParE) in the EMotion Share track, we leverage
+their rich dataset of multilingual speakers and multi-label regression target
+of 'emotion share' or perception of that emotion. We demonstrate that the
+training scheme of different foundation models dictates their effectiveness for
+tasks beyond speech recognition, especially for non-semantic speech tasks like
+emotion understanding. This is a very complex task due to multilingual
+speakers, variability in the target labels, and inherent imbalance in the
+regression dataset. Our results show that HuBERT-Large with a
+self-attention-based light-weight sequence model provides a 4.6% improvement
+over the reported baseline.
+
+
+ comment: Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges + Track +
+
+
+
+
+ + ♻ ☆ Context-VQA: Towards Context-Aware and Purposeful Visual Question + Answering ICCV 2023 + + +
+ Visual question answering (VQA) has the potential to make the Internet more +accessible in an interactive way, allowing people who cannot see images to ask +questions about them. However, multiple studies have shown that people who are +blind or have low-vision prefer image explanations that incorporate the context +in which an image appears, yet current VQA datasets focus on images in +isolation. We argue that VQA models will not fully succeed at meeting people's +needs unless they take context into account. To further motivate and analyze +the distinction between different contexts, we introduce Context-VQA, a VQA +dataset that pairs images with contexts, specifically types of websites (e.g., +a shopping website). We find that the types of questions vary systematically +across contexts. For example, images presented in a travel context garner 2 +times more "Where?" questions, and images on social media and news garner 2.8 +and 1.8 times more "Who?" questions than the average. We also find that context +effects are especially important when participants can't see the image. These +results demonstrate that context affects the types of questions asked and that +VQA models should be context-sensitive to better meet people's needs, +especially in accessibility settings. + +
+
+ comment: Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision + and Language +
+
+
+
+
+ + ♻ ☆ MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with + Transformers + + +
+ Message Passing Interface (MPI) plays a crucial role in distributed memory
+parallelization across multiple nodes. However, parallelizing MPI code
+manually, and specifically, performing domain decomposition, is a challenging,
+error-prone task. In this paper, we address this problem by developing
+MPI-RICAL, a novel data-driven, programming-assistance tool that assists
+programmers in writing domain decomposition based distributed memory
+parallelization code. Specifically, we train a supervised language model to
+suggest MPI functions and their proper locations in the code on the fly. We
+also introduce MPICodeCorpus, the first publicly available corpus of MPI-based
+parallel programs that is created by mining more than 15,000 open-source
+repositories on GitHub. Experiments have been conducted on MPICodeCorpus and,
+more importantly, on a compiled benchmark of MPI-based parallel programs for
+numerical computations that represent real-world scientific applications.
+MPI-RICAL achieves F1 scores between 0.87 and 0.91 on these programs,
+demonstrating its accuracy in suggesting correct MPI functions at appropriate
+code locations. The source code used in this work, as well as other relevant
+sources, are available at:
+https://github.com/Scientific-Computing-Lab-NRCN/MPI-rical
+
+
+
+
+
+ + ♻ ☆ LibriSQA: Advancing Free-form and Open-ended Spoken Question Answering + with a Novel Dataset and Framework + + +
+ While Large Language Models (LLMs) have demonstrated commendable performance +across a myriad of domains and tasks, existing LLMs still exhibit a palpable +deficit in handling multimodal functionalities, especially for the Spoken +Question Answering (SQA) task which necessitates precise alignment and deep +interaction between speech and text features. To address the SQA challenge on +LLMs, we initially curated the free-form and open-ended LibriSQA dataset from +Librispeech, comprising Part I with natural conversational formats and Part II +encompassing multiple-choice questions followed by answers and analytical +segments. Both parts collectively include 107k SQA pairs that cover various +topics. Given the evident paucity of existing speech-text LLMs, we propose a +lightweight, end-to-end framework to execute the SQA task on the LibriSQA, +witnessing significant results. By reforming ASR into the SQA format, we +further substantiate our framework's capability in handling ASR tasks. Our +empirical findings bolster the LLMs' aptitude for aligning and comprehending +multimodal information, paving the way for the development of universal +multimodal LLMs. The dataset and demo can be found at +https://github.com/ZihanZhaoSJTU/LibriSQA. + +
+
+
+
+
+ + ♻ ☆ Large Language Models are not Fair Evaluators + + +
+ In this paper, we uncover a systematic bias in the evaluation paradigm of
+adopting large language models (LLMs), e.g., GPT-4, as a referee to score and
+compare the quality of responses generated by candidate models. We find that
+the quality ranking of candidate responses can be easily hacked by simply
+altering their order of appearance in the context. This manipulation allows us
+to skew the evaluation result, making one model appear considerably superior to
+the other, e.g., Vicuna-13B could beat ChatGPT on 66 of 80 tested queries
+with ChatGPT as an evaluator. To address this issue, we propose a calibration
+framework with three simple yet effective strategies: 1) Multiple Evidence
+Calibration, which requires the evaluator model to generate multiple evaluation
+evidence before assigning ratings; 2) Balanced Position Calibration, which
+aggregates results across various orders to determine the final score; 3)
+Human-in-the-Loop Calibration, which introduces a balanced position diversity
+entropy to measure the difficulty of each example and seeks human assistance
+when needed. We also manually annotate the "win/tie/lose" outcomes of responses
+from ChatGPT and Vicuna-13B on the Vicuna Benchmark's question prompts, and
+extensive experiments demonstrate that our approach successfully mitigates
+evaluation bias, resulting in closer alignment with human judgments. We release
+our code and human annotation at \url{https://github.com/i-Eval/FairEval} to
+facilitate future research.
+
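+ Balanced Position Calibration, the second strategy above, reduces to
+evaluating both presentation orders and averaging (the judge callable is a
+hypothetical stand-in for the LLM evaluator):
+
+def balanced_position_score(judge, prompt, resp_a, resp_b):
+    # judge(prompt, first, second) -> (score_first, score_second).
+    # Query both orders and average per response, cancelling position bias.
+    s1_a, s1_b = judge(prompt, resp_a, resp_b)
+    s2_b, s2_a = judge(prompt, resp_b, resp_a)
+    return (s1_a + s2_a) / 2, (s1_b + s2_b) / 2
+
+biased = lambda prompt, first, second: (7.0, 5.0)  # always favors slot one
+print(balanced_position_score(biased, "q", "A", "B"))  # (6.0, 6.0)
+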
+
+
+
+
+ + ♻ ☆ Formal specification terminology for demographic agent-based models of + fixed-step single-clocked simulations + + +
+ This document presents adequate formal terminology for the mathematical
+specification of a subset of Agent Based Models (ABMs) in the field of
+Demography. The simulation of the targeted ABMs follows a fixed-step
+single-clocked pattern. The proposed terminology further improves model
+understanding and can act as a stand-alone methodology for the specification
+and optionally the documentation of a significant set of (demographic) ABMs.
+Nevertheless, it is imaginable that this terminology, possibly with further
+extensions, can be merged with the largely informal, widely used model
+documentation and communication O.D.D. protocol [Grimm et al., 2020,
+Amouroux et al., 2010] to reduce the many sources of ambiguity that hinder
+model replication by other modelers. A published demographic model
+documentation, a largely simplified version of the Lone Parent Model [Gostoli
+and Silverman, 2020], is separately published in [Elsheikh, 2023b] as an
+illustration of the formal terminology. The model was implemented in the Julia
+language [Elsheikh, 2023a] based on the Agents.jl Julia package [Datseris et
+al., 2022].
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2307.16548 +
+
+
+
+
+ + ♻ ☆ FurChat: An Embodied Conversational Agent using LLMs, Combining Open and + Closed-Domain Dialogue with Facial Expressions SIGDIAL 2023 + + +
+ We demonstrate an embodied conversational agent that can function as a +receptionist and generate a mixture of open and closed-domain dialogue along +with facial expressions, by using a large language model (LLM) to develop an +engaging conversation. We deployed the system onto a Furhat robot, which is +highly expressive and capable of using both verbal and nonverbal cues during +interaction. The system was designed specifically for the National Robotarium +to interact with visitors through natural conversations, providing them with +information about the facilities, research, news, upcoming events, etc. The +system utilises the state-of-the-art GPT-3.5 model to generate such information +along with domain-general conversations and facial expressions based on prompt +engineering. + +
+
+ comment: 5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the + Special Interest Group on Discourse and Dialogue), for the demo video, see + https://youtu.be/fwtUl1kl22s +
+
+
+
+
+ + ♻ ☆ CLSE: Corpus of Linguistically Significant Entities EMNLP 2022 + + +
+ One of the biggest challenges of natural language generation (NLG) is the +proper handling of named entities. Named entities are a common source of +grammar mistakes such as wrong prepositions, wrong article handling, or +incorrect entity inflection. Without factoring linguistic representation, such +errors are often underrepresented when evaluating on a small set of arbitrarily +picked argument values, or when translating a dataset from a linguistically +simpler language, like English, to a linguistically complex language, like +Russian. However, for some applications, broadly precise grammatical +correctness is critical -- native speakers may find entity-related grammar +errors silly, jarring, or even offensive. + To enable the creation of more linguistically diverse NLG datasets, we +release a Corpus of Linguistically Significant Entities (CLSE) annotated by +linguist experts. The corpus includes 34 languages and covers 74 different +semantic types to support various applications from airline ticketing to video +games. To demonstrate one possible use of CLSE, we produce an augmented version +of the Schema-Guided Dialog Dataset, SGD-CLSE. Using the CLSE's entities and a +small number of human translations, we create a linguistically representative +NLG evaluation benchmark in three languages: French (high-resource), Marathi +(low-resource), and Russian (highly inflected language). We establish quality +baselines for neural, template-based, and hybrid NLG systems and discuss the +strengths and weaknesses of each approach. + +
+
+ comment: Proceedings of the 2nd Workshop on Natural Language Generation, + Evaluation, and Metrics (GEM 2022) at EMNLP 2022 +
+
+
+
+
+ + ♻ ☆ Red-Teaming Large Language Models using Chain of Utterances for + Safety-Alignment + + +
+ Large language models (LLMs) have taken the world by storm with their
+massive multi-tasking capabilities simply by optimizing over a next-word
+prediction objective. With the emergence of their properties and encoded
+knowledge, the risk of LLMs producing harmful outputs increases, making them
+unfit for scalable deployment for the public. In this work, we propose a new
+safety evaluation benchmark RED-EVAL that carries out red-teaming. We show that
+even widely deployed models are susceptible to the Chain of Utterances-based
+(CoU) prompting, jailbreaking closed-source LLM-based systems such as GPT-4 and
+ChatGPT to unethically respond to more than 65% and 73% of harmful queries. We
+also demonstrate the consistency of the RED-EVAL across 8 open-source LLMs in
+generating harmful responses in more than 86% of the red-teaming attempts.
+Next, we propose RED-INSTRUCT, an approach for the safety alignment of LLMs. It
+constitutes two phases: 1) HARMFULQA data collection: Leveraging CoU prompting,
+we collect a dataset that consists of 1.9K harmful questions covering a wide
+range of topics, 9.5K safe and 7.3K harmful conversations from ChatGPT; 2)
+SAFE-ALIGN: We demonstrate how the conversational dataset can be used for the
+safety alignment of LLMs by minimizing the negative log-likelihood over helpful
+responses and penalizing harmful responses by gradient ascent over sample
+loss. Our model STARLING, a fine-tuned Vicuna-7B, is observed to be more safely
+aligned when evaluated on RED-EVAL and HHH benchmarks while preserving the
+utility of the baseline models (TruthfulQA, MMLU, and BBH).
+
+
+
+
+
+ + ♻ ☆ SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge + Distillation from BERT + + +
+ Spiking neural networks (SNNs) offer a promising avenue to implement deep +neural networks in a more energy-efficient way. However, the network +architectures of existing SNNs for language tasks are too simplistic, and deep +architectures have not been fully explored, resulting in a significant +performance gap compared to mainstream transformer-based networks such as BERT. +To this end, we improve a recently-proposed spiking transformer (i.e., +Spikformer) to make it possible to process language tasks and propose a +two-stage knowledge distillation method for training it, which combines +pre-training by distilling knowledge from BERT with a large collection of +unlabelled texts and fine-tuning with task-specific instances via knowledge +distillation again from the BERT fine-tuned on the same training examples. +Through extensive experimentation, we show that the models trained with our +method, named SpikeBERT, outperform state-of-the-art SNNs and even achieve +comparable results to BERTs on text classification tasks for both English and +Chinese with much less energy consumption. + +
+
+
+
+
+ + ♻ ☆ Reliable Natural Language Understanding with Large Language Models and + Answer Set Programming + + +
+ Humans understand language by extracting information (meaning) from +sentences, combining it with existing commonsense knowledge, and then +performing reasoning to draw conclusions. While large language models (LLMs) +such as GPT-3 and ChatGPT are able to leverage patterns in the text to solve a +variety of NLP tasks, they fall short in problems that require reasoning. They +also cannot reliably explain the answers generated for a given question. In +order to emulate humans better, we propose STAR, a framework that combines LLMs +with Answer Set Programming (ASP). We show how LLMs can be used to effectively +extract knowledge -- represented as predicates -- from language. Goal-directed +ASP is then employed to reliably reason over this knowledge. We apply the STAR +framework to three different NLU tasks requiring reasoning: qualitative +reasoning, mathematical reasoning, and goal-directed conversation. Our +experiments reveal that STAR is able to bridge the gap of reasoning in NLU +tasks, leading to significant performance improvements, especially for smaller +LLMs, i.e., LLMs with a smaller number of parameters. NLU applications +developed using the STAR framework are also explainable: along with the +predicates generated, a justification in the form of a proof tree can be +produced for a given output. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ♻ ☆ A Survey of Knowledge Enhanced Pre-trained Language Models + + +
+ Pre-trained Language Models (PLMs), which are trained on large text corpora
+via self-supervised learning, have yielded promising performance on various
+tasks in Natural Language Processing (NLP). However, though PLMs with huge
+parameter counts can effectively possess rich knowledge learned from massive
+training text and benefit downstream tasks at the fine-tuning stage, they still
+have some limitations such as poor reasoning ability due to the lack of
+external knowledge. Research has been dedicated to incorporating knowledge into
+PLMs to tackle these issues. In this paper, we present a comprehensive review
+of Knowledge Enhanced Pre-trained Language Models (KE-PLMs) to provide a clear
+insight into this thriving field. We introduce appropriate taxonomies
+respectively for Natural Language Understanding (NLU) and Natural Language
+Generation (NLG) to highlight these two main tasks of NLP. For NLU, we divide
+the types of knowledge into four categories: linguistic knowledge, text
+knowledge, knowledge graph (KG), and rule knowledge. The KE-PLMs for NLG are
+categorized into KG-based and retrieval-based methods. Finally, we point out
+some promising future directions of KE-PLMs.
+
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable tools for users to navigate through the vast amount of content available on platforms like daily news websites and book recommendation services. However, existing recommenders face significant challenges in understanding the content of items. Large language models (LLMs), which possess deep semantic comprehension and extensive knowledge from pretraining, have proven to be effective in various natural language processing tasks. In this study, we explore the potential of leveraging both open- and closed-source LLMs to enhance content-based recommendation. With open-source LLMs, we utilize their deep layers as content encoders, enriching the representation of content at the embedding level. For closed-source LLMs, we employ prompting techniques to enrich the training data at the token level. Through comprehensive experiments, we demonstrate the high effectiveness of both types of LLMs and show the synergistic relationship between them. Notably, we observe a significant relative improvement of up to 19.32% compared to existing state-of-the-art recommendation models. These findings highlight the immense potential of both open- and closed-source LLMs in enhancing content-based recommendation systems. We will make our code and LLM-generated data available for other researchers to reproduce our results.
+
+
+
+
+ + ♻ ☆ Adapting Text-based Dialogue State Tracker for Spoken Dialogues SIGDIAL 2023 + + +
+ Although there have been remarkable advances in dialogue systems through the dialogue systems technology competition (DSTC), it remains one of the key challenges to build a robust task-oriented dialogue system with a speech interface. Most of the progress has been made for text-based dialogue systems, since there are abundant datasets with written corpora while those with spoken dialogues are very scarce. However, as can be seen from voice assistant systems such as Siri and Alexa, it is of practical importance to transfer the success to spoken dialogues. In this paper, we describe our engineering effort in building a highly successful model that participated in the speech-aware dialogue systems technology challenge track in DSTC11. Our model consists of three major modules: (1) automatic speech recognition error correction to bridge the gap between the spoken and the text utterances, (2) a text-based dialogue system (D3ST) for estimating the slots and values using slot descriptions, and (3) post-processing for recovering from errors in the estimated slot values. Our experiments show that it is important to use an explicit automatic speech recognition error correction module, post-processing, and data augmentation to adapt a text-based dialogue state tracker for spoken dialogue corpora.
+
+ comment: 8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at + SIGDIAL 2023 +
+
+
+
+
+ + ♻ ☆ Marshall-Olkin Power-Law Distributions in Length-Frequency of Entities + + +
+ Entities involve important concepts with concrete meanings and play important roles in numerous linguistic tasks. Entities take different forms in different linguistic tasks, and researchers treat those different forms as different concepts. In this paper, we ask whether there are common characteristics that connect those different forms of entities. Specifically, we investigate the underlying distributions of entities of different types and in different languages, trying to identify common characteristics behind these diverse entities. After analyzing twelve datasets covering different types of entities and eighteen datasets covering entities in different languages, we find that, while these entities are dramatically diverse in many aspects, their length-frequencies can be well characterized by a family of Marshall-Olkin power-law (MOPL) distributions. We conduct experiments on those thirty datasets spanning different entity types and languages, and the experimental results demonstrate that MOPL models characterize the length-frequencies of entities much better than two state-of-the-art power-law models and an alternative log-normal model. Experimental results also demonstrate that MOPL models are scalable to the length-frequency of entities in large-scale real-world datasets.
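For readers unfamiliar with the family, a Marshall-Olkin extension adds one extra parameter to a base distribution via its survival function. The sketch below applies the standard Marshall-Olkin (1997) construction to a power-law tail; the paper's exact MOPL parameterization for length-frequencies may differ.

```python
import numpy as np

def mo_powerlaw_sf(x, alpha: float, beta: float, x_min: float = 1.0):
    """Survival function of a Marshall-Olkin extended power law (sketch).

    Uses the Marshall-Olkin construction G(x) = a*F(x) / (1 - (1-a)*F(x))
    applied to a power-law tail F(x) = (x_min/x)**beta; assumed form, not
    necessarily the paper's parameterization.
    """
    x = np.asarray(x, dtype=float)
    f_bar = np.where(x < x_min, 1.0, (x_min / x) ** beta)  # base power-law tail
    return alpha * f_bar / (1.0 - (1.0 - alpha) * f_bar)
```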
+
+ comment: 33 pages, 3 figures (30 subfigures), 8 tables. To appear in + Knowledge-Based Systems +
+
+
+
+
+ + ♻ ☆ Automatically Correcting Large Language Models: Surveying the landscape + of diverse self-correction strategies + + +
+ Large language models (LLMs) have demonstrated remarkable performance across +a wide array of NLP tasks. However, their efficacy is undermined by undesired +and inconsistent behaviors, including hallucination, unfaithful reasoning, and +toxic content. A promising approach to rectify these flaws is self-correction, +where the LLM itself is prompted or guided to fix problems in its own output. +Techniques leveraging automated feedback -- either produced by the LLM itself +or some external system -- are of particular interest as they are a promising +way to make LLM-based solutions more practical and deployable with minimal +human feedback. This paper presents a comprehensive review of this emerging +class of techniques. We analyze and taxonomize a wide array of recent work +utilizing these strategies, including training-time, generation-time, and +post-hoc correction. We also summarize the major applications of this strategy +and conclude by discussing future directions and challenges. + +
+
+ comment: Work in Progress. Version 2 +
+
+
+
+
+ + ♻ ☆ WeaverBird: Empowering Financial Decision-Making with Large Language + Model, Knowledge Base, and Search Engine + + +
+ We present WeaverBird, an intelligent dialogue system designed specifically for the finance domain. Our system harnesses a large language model of the GPT architecture that has been tuned using extensive corpora of finance-related text. As a result, our system possesses the capability to understand complex financial queries, such as "How should I manage my investments during inflation?", and provide informed responses. Furthermore, our system incorporates a local knowledge base and a search engine to retrieve relevant information. The final responses are conditioned on the search results and include proper citations to the sources, thus enjoying enhanced credibility. Through a range of finance-related questions, we have demonstrated the superior performance of our system compared to other models. To experience our system firsthand, users can interact with our live demo at https://weaverbird.ttic.edu, as well as watch our 2-min video illustration at https://www.youtube.com/watch?v=fyV2qQkX6Tc.
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can arise from extensive unsupervised training data. Such biases can produce suboptimal samples, skewed outcomes, and unfairness, with potentially serious consequences. Consequently, aligning these models with human ethics and preferences is an essential step toward ensuring their responsible and effective deployment in real-world applications. Prior research has primarily employed Reinforcement Learning from Human Feedback (RLHF) to address this problem, where generative models are fine-tuned with RL algorithms guided by a human-feedback-informed reward model. However, the inefficiencies and instabilities associated with RL algorithms frequently present substantial obstacles to successful alignment, necessitating the development of a more robust and streamlined approach. To this end, we introduce a new framework, Reward rAnked FineTuning (RAFT), designed to align generative models effectively. Utilizing a reward model and a sufficient number of samples, our approach selects the high-quality samples, discards those that exhibit undesired behavior, and subsequently enhances the model by fine-tuning on these filtered samples. Our studies show that RAFT can effectively improve model performance on both reward scores and other automated metrics, for both large language models and diffusion models.
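The core loop of reward-ranked fine-tuning is simple enough to sketch in a few lines. In the Python sketch below, `generate`, `reward` and `finetune` are assumed interfaces rather than a specific library API, and keeping only the single best of k samples per prompt is one simple ranking policy the framework admits.

```python
from typing import Callable, List, Tuple

def raft_round(
    generate: Callable[[str], str],          # assumed: sample one completion
    reward: Callable[[str, str], float],     # assumed: reward model score
    finetune: Callable[[List[Tuple[str, str]]], None],  # assumed: SFT step
    prompts: List[str],
    k: int = 8,
) -> None:
    """One RAFT iteration (sketch): sample, rank by reward, fine-tune on the best."""
    best = []
    for prompt in prompts:
        samples = [generate(prompt) for _ in range(k)]
        top = max(samples, key=lambda s: reward(prompt, s))  # reward-ranked selection
        best.append((prompt, top))
    finetune(best)  # supervised fine-tuning on the filtered, high-reward samples
```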
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ SCOTT: Self-Consistent Chain-of-Thought Distillation ACL 2023 + + +
+ Large language models (LMs) beyond a certain scale demonstrate the emergent capability of generating free-text rationales for their predictions via chain-of-thought (CoT) prompting. While CoT can yield dramatically improved performance, such gains are only observed for sufficiently large LMs. Even more concerning, there is little guarantee that the generated rationales are consistent with the LM's predictions or faithfully justify the decisions. In this work, we propose a faithful knowledge distillation method to learn a small, self-consistent CoT model from a teacher model that is orders of magnitude larger. To form better supervision, we elicit rationales supporting the gold answers from a large LM (teacher) by contrastive decoding, which encourages the teacher to generate tokens that become more plausible only when the answer is considered. To ensure faithful distillation, we use the teacher-generated rationales to learn a student LM with a counterfactual reasoning objective, which prevents the student from ignoring the rationales to make inconsistent predictions. Experiments show that, while yielding comparable end-task performance, our method can generate CoT rationales that are more faithful than those of baselines. Further analysis suggests that such a model respects the rationales more when making decisions; thus, we can improve its performance more by refining its rationales.
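The contrastive elicitation step can be pictured as a per-token score: a rationale token is preferred when the gold answer makes it more likely. In the sketch below, the two log-probability callables are assumed interfaces to the teacher LM rather than a specific library API.

```python
from typing import Callable, Sequence

def contrastive_token_score(
    logp_with_answer: Callable[[Sequence[str], str], float],     # assumed interface
    logp_without_answer: Callable[[Sequence[str], str], float],  # assumed interface
    prefix: Sequence[str],
    candidate: str,
) -> float:
    """Contrastive decoding score (sketch): prefer rationale tokens that become
    more plausible only when the gold answer is present in the prompt."""
    return logp_with_answer(prefix, candidate) - logp_without_answer(prefix, candidate)
```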
+
+ comment: 11 pages, 8 figures. Accepted to ACL 2023 +
+
+
+
+
+ + ♻ ☆ Automatic Design of Semantic Similarity Ensembles Using Grammatical + Evolution + + +
+ Semantic similarity measures are widely used in natural language processing to support various computational tasks. However, no single semantic similarity measure is the most appropriate for all tasks, and researchers often use ensemble strategies to ensure performance. This research work proposes a method for automatically designing semantic similarity ensembles. In fact, our proposed method uses grammatical evolution, for the first time, to automatically select and aggregate measures from a pool of candidates to create an ensemble that maximizes correlation to human judgment. The method is evaluated on several benchmark datasets and compared to state-of-the-art ensembles, showing that it can significantly improve similarity assessment accuracy and outperform existing methods in some cases. As a result, our research demonstrates the potential of using grammatical evolution to automatically compare text and proves the benefits of using ensembles for semantic similarity tasks. The source code that illustrates our approach can be downloaded from https://github.com/jorge-martinez-gil/sesige.
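The fitness that grammatical evolution maximizes can be made concrete: score a candidate aggregation by its correlation with human judgments. In the sketch below a plain weighted mean stands in for the evolved aggregation expression, and Spearman correlation is an assumed choice of statistic.

```python
import numpy as np
from scipy.stats import spearmanr

def ensemble_fitness(measure_scores: np.ndarray, weights: np.ndarray,
                     human_scores: np.ndarray) -> float:
    """Fitness of a candidate ensemble (sketch): correlation to human judgment.

    measure_scores: (n_pairs, n_measures) scores from the candidate measure pool.
    weights: aggregation weights; a weighted mean stands in here for the
    aggregation expression that grammatical evolution would actually evolve.
    """
    ensemble = measure_scores @ weights / weights.sum()
    return spearmanr(ensemble, human_scores).correlation
```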
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ DALL-Eval: Probing the Reasoning Skills and Social Biases of + Text-to-Image Generation Models ICCV 2023 + + +
+ Recently, DALL-E, a multimodal transformer language model, and its variants, +including diffusion models, have shown high-quality text-to-image generation +capabilities. However, despite the realistic image generation results, there +has not been a detailed analysis of how to evaluate such models. In this work, +we investigate the visual reasoning capabilities and social biases of different +text-to-image models, covering both multimodal transformer language models and +diffusion models. First, we measure three visual reasoning skills: object +recognition, object counting, and spatial relation understanding. For this, we +propose PaintSkills, a compositional diagnostic evaluation dataset that +measures these skills. Despite the high-fidelity image generation capability, a +large gap exists between the performance of recent models and the upper bound +accuracy in object counting and spatial relation understanding skills. Second, +we assess the gender and skin tone biases by measuring the gender/skin tone +distribution of generated images across various professions and attributes. We +demonstrate that recent text-to-image generation models learn specific biases +about gender and skin tone from web image-text pairs. We hope our work will +help guide future progress in improving text-to-image generation models on +visual reasoning skills and learning socially unbiased representations. Code +and data: https://github.com/j-min/DallEval + +
+
+ comment: ICCV 2023 (34 pages; see appendix for version changelog) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 129 + +
+
+
+ + ☆ Boosting Detection in Crowd Analysis via Underutilized Output Features + + +
+ Detection-based methods have been viewed unfavorably in crowd analysis due to +their poor performance in dense crowds. However, we argue that the potential of +these methods has been underestimated, as they offer crucial information for +crowd analysis that is often ignored. Specifically, the area size and +confidence score of output proposals and bounding boxes provide insight into +the scale and density of the crowd. To leverage these underutilized features, +we propose Crowd Hat, a plug-and-play module that can be easily integrated with +existing detection models. This module uses a mixed 2D-1D compression technique +to refine the output features and obtain the spatial and numerical distribution +of crowd-specific information. Based on these features, we further propose +region-adaptive NMS thresholds and a decouple-then-align paradigm that address +the major limitations of detection-based methods. Our extensive evaluations on +various crowd analysis tasks, including crowd counting, localization, and +detection, demonstrate the effectiveness of utilizing output features and the +potential of detection-based methods in crowd analysis. + +
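As one way to picture a region-adaptive NMS threshold, the sketch below raises the IoU threshold with the estimated local crowd density, so dense regions tolerate more box overlap. The linear schedule and its constants are illustrative assumptions, not Crowd Hat's actual parameterization.

```python
def adaptive_nms_threshold(density: float, t_low: float = 0.3,
                           t_high: float = 0.6, d_max: float = 10.0) -> float:
    """Region-adaptive NMS threshold (sketch): denser regions get a higher IoU
    threshold so overlapping detections of nearby people survive suppression.
    The linear schedule and constants are illustrative assumptions."""
    return t_low + (t_high - t_low) * min(density, d_max) / d_max
```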
+
+ comment: project page: https://fredfyyang.github.io/Crowd-Hat/ +
+
+
+
+
+ + ☆ SAM-Med2D + + +
+ The Segment Anything Model (SAM) represents a state-of-the-art research advancement in natural image segmentation, achieving impressive results with input prompts such as points and bounding boxes. However, our evaluation and recent research indicate that directly applying the pretrained SAM to medical image segmentation does not yield satisfactory performance. This limitation primarily arises from the significant domain gap between natural images and medical images. To bridge this gap, we introduce SAM-Med2D, the most comprehensive study to date on applying SAM to medical 2D images. Specifically, we first collect and curate approximately 4.6M images and 19.7M masks from public and private datasets, constructing a large-scale medical image segmentation dataset encompassing various modalities and objects. Then, we comprehensively fine-tune SAM on this dataset and turn it into SAM-Med2D. Unlike previous methods that only adopt bounding box or point prompts as the interactive segmentation approach, we adapt SAM to medical image segmentation through more comprehensive prompts involving bounding boxes, points, and masks. We additionally fine-tune the encoder and decoder of the original SAM to obtain a well-performing SAM-Med2D, leading to the most comprehensive fine-tuning strategy to date. Finally, we conducted a comprehensive evaluation and analysis to investigate the performance of SAM-Med2D in medical image segmentation across various modalities, anatomical structures, and organs. Concurrently, we validated the generalization capability of SAM-Med2D on 9 datasets from the MICCAI 2023 challenge. Overall, our approach demonstrated significantly superior performance and generalization capability compared to SAM.
+
+
+
+
+ + ☆ GREC: Generalized Referring Expression Comprehension + + +
+ The objective of Classic Referring Expression Comprehension (REC) is to produce a bounding box corresponding to the object mentioned in a given textual description. Commonly, existing datasets and techniques in classic REC are tailored for expressions that pertain to a single target, meaning a sole expression is linked to one specific object. Expressions that refer to multiple targets or involve no specific target have not been taken into account. This constraint hinders the practical applicability of REC. This study introduces a new benchmark termed Generalized Referring Expression Comprehension (GREC). This benchmark extends the classic REC by permitting expressions to describe any number of target objects. To achieve this goal, we have built the first large-scale GREC dataset named gRefCOCO. This dataset encompasses a range of expressions: those referring to multiple targets, expressions with no specific target, and single-target expressions. The design of GREC and gRefCOCO ensures smooth compatibility with classic REC. The proposed gRefCOCO dataset, a GREC method implementation code, and GREC evaluation code are available at https://github.com/henghuiding/gRefCOCO.
+
+ comment: GREC Technical Report, Project Page: + https://henghuiding.github.io/GRES +
+
+
+
+
+ + ☆ MMVP: Motion-Matrix-based Video Prediction ICCV 2023 + + +
+ A central challenge of video prediction is that the system has to reason about objects' future motions from image frames while simultaneously maintaining the consistency of their appearances across frames. This work introduces an end-to-end trainable two-stream video prediction framework, Motion-Matrix-based Video Prediction (MMVP), to tackle this challenge. Unlike previous methods that usually handle motion prediction and appearance maintenance within the same set of modules, MMVP decouples motion and appearance information by constructing appearance-agnostic motion matrices. The motion matrices represent the temporal similarity of each and every pair of feature patches in the input frames, and are the sole input of the motion prediction module in MMVP. This design improves video prediction in both accuracy and efficiency, and reduces the model size. Results of extensive experiments demonstrate that MMVP outperforms state-of-the-art systems on public data sets by non-negligible margins (about 1 dB in PSNR on UCF Sports) with significantly smaller model sizes (84% of the size or smaller). Please refer to https://github.com/Kay1794/MMVP-motion-matrix-based-video-prediction for the official code and the datasets used in this paper.
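To make the motion-matrix idea concrete, here is a minimal sketch that builds the pairwise patch-similarity matrix from two consecutive feature maps; the cosine normalization and the absence of any multi-scale handling are simplifying assumptions relative to the full MMVP module.

```python
import torch
import torch.nn.functional as F

def motion_matrix(feat_t: torch.Tensor, feat_t1: torch.Tensor) -> torch.Tensor:
    """Appearance-agnostic motion matrix (sketch): cosine similarity between
    every pair of feature patches in consecutive frames.

    feat_t, feat_t1: (C, H, W) feature maps of frames t and t+1.
    Returns an (H*W, H*W) similarity matrix of the kind the motion prediction
    module consumes; normalization details are assumptions.
    """
    a = F.normalize(feat_t.flatten(1).t(), dim=-1)   # (H*W, C) unit vectors
    b = F.normalize(feat_t1.flatten(1).t(), dim=-1)  # (H*W, C) unit vectors
    return a @ b.t()
```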
+
+ comment: ICCV 2023 (Oral) +
+
+
+
+
+ + ☆ Modality Cycles with Masked Conditional Diffusion for Unsupervised + Anomaly Segmentation in MRI MICCAI + 2023 + + +
+ Unsupervised anomaly segmentation aims to detect patterns that are distinct +from any patterns processed during training, commonly called abnormal or +out-of-distribution patterns, without providing any associated manual +segmentations. Since anomalies during deployment can lead to model failure, +detecting the anomaly can enhance the reliability of models, which is valuable +in high-risk domains like medical imaging. This paper introduces Masked +Modality Cycles with Conditional Diffusion (MMCCD), a method that enables +segmentation of anomalies across diverse patterns in multimodal MRI. The method +is based on two fundamental ideas. First, we propose the use of cyclic modality +translation as a mechanism for enabling abnormality detection. +Image-translation models learn tissue-specific modality mappings, which are +characteristic of tissue physiology. Thus, these learned mappings fail to +translate tissues or image patterns that have never been encountered during +training, and the error enables their segmentation. Furthermore, we combine +image translation with a masked conditional diffusion model, which attempts to +`imagine' what tissue exists under a masked area, further exposing unknown +patterns as the generative model fails to recreate them. We evaluate our method +on a proxy task by training on healthy-looking slices of BraTS2021 +multi-modality MRIs and testing on slices with tumors. We show that our method +compares favorably to previous unsupervised approaches based on image +reconstruction and denoising with autoencoders and diffusion models. + +
+
+ comment: Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI + 2023 +
+
+
+
+
+ + ☆ CircleFormer: Circular Nuclei Detection in Whole Slide Images with + Circle Queries and Attention MICCAI 2023 + + +
+ Both CNN-based and Transformer-based object detection with bounding box representation have been extensively studied in computer vision and medical image analysis, but circular object detection in medical images is still underexplored. Inspired by the recent anchor-free CNN-based circular object detection method (CircleNet) for ball-shaped glomeruli detection in renal pathology, in this paper we present CircleFormer, a Transformer-based circular medical object detection method with dynamic anchor circles. Specifically, queries with circle representation in the Transformer decoder iteratively refine the circular object detection results, and a circle cross attention module is introduced to compute the similarity between circular queries and image features. A generalized circle IoU (gCIoU) is also proposed to serve as a new regression loss for circular object detection. Moreover, our approach is easy to generalize to the segmentation task by adding a simple segmentation branch to CircleFormer. We evaluate our method on circular nuclei detection and segmentation on the public MoNuSeg dataset, and the experimental results show that our method achieves promising performance compared with the state-of-the-art approaches. The effectiveness of each component is validated via ablation studies as well. Our code is released at: \url{https://github.com/zhanghx-iim-ahu/CircleFormer}.
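For intuition about circle-based regression targets, the sketch below computes the plain IoU of two circles from elementary geometry; the paper's generalized gCIoU builds a regression loss on top of a quantity like this (details in the paper).

```python
import math

def circle_iou(c1, c2):
    """IoU of two circles given as (x, y, r) -- the plain circle IoU that
    CircleNet-style detectors use; gCIoU generalizes a quantity like this."""
    x1, y1, r1 = c1
    x2, y2, r2 = c2
    d = math.hypot(x2 - x1, y2 - y1)
    if d >= r1 + r2:                       # disjoint circles
        inter = 0.0
    elif d <= abs(r1 - r2):                # one circle contains the other
        inter = math.pi * min(r1, r2) ** 2
    else:                                  # lens-shaped overlap region
        a1 = r1 * r1 * math.acos((d * d + r1 * r1 - r2 * r2) / (2 * d * r1))
        a2 = r2 * r2 * math.acos((d * d + r2 * r2 - r1 * r1) / (2 * d * r2))
        a3 = 0.5 * math.sqrt((-d + r1 + r2) * (d + r1 - r2)
                             * (d - r1 + r2) * (d + r1 + r2))
        inter = a1 + a2 - a3
    union = math.pi * (r1 * r1 + r2 * r2) - inter
    return inter / union if union > 0 else 0.0
```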
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones, organs, vessels) and 3D surgical instrument models. Prior to the deep learning era, the broad application of statistical shape models (SSMs) in medical image analysis was evidence that shapes were commonly used to describe medical data. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in medical imaging are predominantly voxel-based. In computer vision, on the contrary, shapes (including voxel occupancy grids, meshes, point clouds and implicit surface models) are the preferred data representations in 3D, as seen from the numerous shape-related publications in premier vision conferences, such as the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as well as the increasing popularity of ShapeNet (about 51,300 models) and Princeton ModelNet (127,915 models) in computer vision research. MedShapeNet is created as an alternative to these commonly used shape benchmarks to facilitate the translation of data-driven vision algorithms to medical applications, and it extends the opportunities to adapt SOTA vision algorithms to solve critical medical problems. Besides, the majority of the medical shapes in MedShapeNet are modeled directly on the imaging data of real patients, and it therefore complements well the existing shape benchmarks comprising computer-aided design (CAD) models. MedShapeNet currently includes more than 100,000 medical shapes, and provides annotations in the form of paired data. It is therefore also a freely available repository of 3D models for extended reality (virtual reality - VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This white paper describes in detail the motivations behind MedShapeNet, the shape acquisition procedures, the use cases, as well as the usage of the online shape search portal: https://medshapenet.ikim.nrw/
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ CorrEmbed: Evaluating Pre-trained Model Image Similarity Efficacy with a + Novel Metric + + +
+ Detecting visually similar images is particularly useful when calculating product recommendations. Embedding similarity, which utilizes pre-trained computer vision models to extract high-level image features, has demonstrated remarkable efficacy in identifying images with similar compositions. However, there is a lack of methods for evaluating the embeddings generated by these models, as conventional loss and performance metrics do not adequately capture their performance in image similarity search tasks.
+ In this paper, we evaluate the viability of the image embeddings from numerous pre-trained computer vision models using a novel approach named CorrEmbed. Our approach computes the correlation between distances in image embeddings and distances in human-generated tag vectors. We extensively evaluate numerous pre-trained Torchvision models using this metric, revealing an intuitive relationship of linear scaling between ImageNet1k accuracy scores and tag-correlation scores. Importantly, our method also identifies deviations from this pattern, providing insights into how different models capture high-level image features.
+ By offering a robust performance evaluation of these pre-trained models, CorrEmbed serves as a valuable tool for researchers and practitioners seeking to develop effective, data-driven approaches to similar item recommendations in fashion retail.
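The metric itself reduces to a correlation between two sets of pairwise distances. Below is a minimal sketch under assumed choices (cosine distances, Pearson correlation); the paper's exact distance metrics and statistic may differ in detail.

```python
import numpy as np
from scipy.stats import pearsonr
from scipy.spatial.distance import pdist

def correembed_score(embeddings: np.ndarray, tag_vectors: np.ndarray) -> float:
    """CorrEmbed-style score (sketch): correlation between pairwise distances
    in embedding space and in human-generated tag space.

    embeddings: (n_images, d_emb), tag_vectors: (n_images, d_tags).
    """
    emb_d = pdist(embeddings, metric="cosine")   # condensed pairwise distances
    tag_d = pdist(tag_vectors, metric="cosine")
    return pearsonr(emb_d, tag_d)[0]
```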
+
+ comment: Accepted to AI-2023 Forty-third SGAI International Conference on + Artificial Intelligence +
+
+
+
+
+ + ☆ Improving Few-shot Image Generation by Structural Discrimination and + Textural Modulation ACM MM 2023 + + +
+ Few-shot image generation, which aims to produce plausible and diverse images for one category given a few images from this category, has drawn extensive attention. Existing approaches either globally interpolate different images or fuse local representations with pre-defined coefficients. However, such an intuitive combination of images/features only exploits the most relevant information for generation, leading to poor diversity and coarse-grained semantic fusion. To remedy this, this paper proposes a novel textural modulation (TexMod) mechanism to inject external semantic signals into internal local representations. Parameterized by the feedback from the discriminator, our TexMod enables finer-grained semantic injection while maintaining the synthesis fidelity. Moreover, a global structural discriminator (StructD) is developed to explicitly guide the model to generate images with reasonable layout and outline. Furthermore, the frequency awareness of the model is reinforced by encouraging the model to distinguish frequency signals. Together with these techniques, we build a novel and effective model for few-shot image generation. The effectiveness of our model is identified by extensive experiments on three popular datasets and various settings. Besides achieving state-of-the-art synthesis performance on these datasets, our proposed techniques could be seamlessly integrated into existing models for a further performance boost.
+
+ comment: To appear in ACM MM 2023, code is available at + https://github.com/kobeshegu/SDTM-GAN-ACMMM-2023 +
+
+
+
+
+ + ☆ Learned Image Reasoning Prior Penetrates Deep Unfolding Network for + Panchromatic and Multi-Spectral Image Fusion ICCV 2023 + + +
+ The success of deep neural networks for pan-sharpening is commonly in a form +of black box, lacking transparency and interpretability. To alleviate this +issue, we propose a novel model-driven deep unfolding framework with image +reasoning prior tailored for the pan-sharpening task. Different from existing +unfolding solutions that deliver the proximal operator networks as the +uncertain and vague priors, our framework is motivated by the content reasoning +ability of masked autoencoders (MAE) with insightful designs. Specifically, the +pre-trained MAE with spatial masking strategy, acting as intrinsic reasoning +prior, is embedded into unfolding architecture. Meanwhile, the pre-trained MAE +with spatial-spectral masking strategy is treated as the regularization term +within loss function to constrain the spatial-spectral consistency. Such +designs penetrate the image reasoning prior into deep unfolding networks while +improving its interpretability and representation capability. The uniqueness of +our framework is that the holistic learning process is explicitly integrated +with the inherent physical mechanism underlying the pan-sharpening task. +Extensive experiments on multiple satellite datasets demonstrate the +superiority of our method over the existing state-of-the-art approaches. Code +will be released at \url{https://manman1995.github.io/}. + +
+
+ comment: 10 pages; Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ SignDiff: Learning Diffusion Models for American Sign Language + Production + + +
+ The field of Sign Language Production (SLP) lacked a large-scale, pre-trained model based on deep learning for continuous American Sign Language (ASL) production in the past decade. This limitation hampers communication for all individuals with disabilities relying on ASL. To address this issue, we undertook the secondary development and utilization of How2Sign, one of the largest publicly available ASL datasets. Despite its significance, prior researchers in the field of sign language have not effectively employed this corpus due to the intricacies involved in American Sign Language Production (ASLP).
+ To conduct large-scale ASLP, we propose SignDiff, based on the latest work in related fields, which is a dual-condition diffusion pre-training model that can generate human sign language speakers from a skeleton pose. SignDiff has a novel Frame Reinforcement Network called FR-Net, similar to dense human pose estimation work, which enhances the correspondence between text lexical symbols and sign language dense pose frames and reduces the occurrence of multiple fingers in the diffusion model. In addition, our ASLP method proposes two new improved modules and a new loss function to improve the accuracy and quality of sign language skeletal posture and enhance the ability of the model to train on large-scale data.
+ We propose the first baseline for ASL production and report scores of 17.19 and 12.85 BLEU-4 on the How2Sign dev/test sets. We also evaluated our model on the previous mainstream dataset PHOENIX14T, where the main experiments achieved SOTA results. In addition, our image quality far exceeds all previous results, by 10 percentage points on the SSIM indicator. Finally, we conducted ablation studies and qualitative evaluations for discussion.
+
+
+
+
+ + ☆ Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for + English to Indian Languages + + +
+ The study investigates the effectiveness of utilizing multimodal information +in Neural Machine Translation (NMT). While prior research focused on using +multimodal data in low-resource scenarios, this study examines how image +features impact translation when added to a large-scale, pre-trained unimodal +NMT system. Surprisingly, the study finds that images might be redundant in +this context. Additionally, the research introduces synthetic noise to assess +whether images help the model deal with textual noise. Multimodal models +slightly outperform text-only models in noisy settings, even with random +images. The study's experiments translate from English to Hindi, Bengali, and +Malayalam, outperforming state-of-the-art benchmarks significantly. +Interestingly, the effect of visual context varies with source text noise: no +visual context works best for non-noisy translations, cropped image features +are optimal for low noise, and full image features work better in high-noise +scenarios. This sheds light on the role of visual context, especially in noisy +settings, opening up a new research direction for Noisy Neural Machine +Translation in multimodal setups. The research emphasizes the importance of +combining visual and textual information for improved translation in various +environments. + +
+
+
+
+
+ + ☆ Semantic Image Synthesis via Class-Adaptive Cross-Attention + + +
+ In semantic image synthesis, the state of the art is dominated by methods that use spatially-adaptive normalization layers, which allow for excellent visual generation quality and editing versatility. Granted their efficacy, recent research efforts have focused on finer-grained local style control and multi-modal generation. By construction though, such layers tend to overlook global image statistics, leading to unconvincing local style editing and causing global inconsistencies such as color or illumination distribution shifts. Also, the semantic layout is required for mapping styles in the generator, putting a strict alignment constraint over the features. In response, we designed a novel architecture where cross-attention layers are used in place of de-normalization ones for conditioning the image generation. Our model inherits the advantages of both solutions, retaining state-of-the-art reconstruction quality, as well as improved global and local style transfer. Code and models available at https://github.com/TFonta/CA2SIS.
+
+
+
+
+ + ☆ From Pixels to Portraits: A Comprehensive Survey of Talking Head + Generation Techniques and Applications + + +
+ Recent advancements in deep learning and computer vision have led to a surge of interest in generating realistic talking heads. This paper presents a comprehensive survey of state-of-the-art methods for talking head generation. We systematically categorise them into four main approaches: image-driven, audio-driven, video-driven and others (including neural radiance fields (NeRF) and 3D-based methods). We provide an in-depth analysis of each method, highlighting their unique contributions, strengths, and limitations. Furthermore, we thoroughly compare publicly available models, evaluating them on key aspects such as inference time and human-rated quality of the generated outputs. Our aim is to provide a clear and concise overview of the current landscape in talking head generation, elucidating the relationships between different approaches and identifying promising directions for future research. This survey will serve as a valuable reference for researchers and practitioners interested in this rapidly evolving field.
+
+
+
+
+ + ☆ Topology-aware MLP for Skeleton-based Action Recognition + + +
+ Graph convolution networks (GCNs) have achieved remarkable performance in skeleton-based action recognition. However, existing GCN-based methods have relied excessively on elaborate human body priors and constructed complex feature aggregation mechanisms, which limits the generalizability of the networks. To solve these problems, we propose a novel Spatial Topology Gating Unit (STGU), an MLP-based variant without extra priors, to capture the co-occurrence topology features that encode the spatial dependency across all joints. In STGU, to model the sample-specific and completely independent point-wise topology attention, a new gate-based feature interaction mechanism is introduced to activate the features point-to-point via the attention map generated from the input. Based on the STGU, in this work, we propose the first topology-aware MLP-based model, Ta-MLP, for skeleton-based action recognition. In comparison with existing methods on three large-scale datasets, Ta-MLP achieves competitive performance. In addition, Ta-MLP reduces the parameters by up to 62.5% with favorable results. Compared with previous state-of-the-art (SOTA) approaches, Ta-MLP pushes the frontier of real-time action recognition. The code will be available at https://github.com/BUPTSJZhang/Ta-MLP.
+
+
+
+
+ + ☆ DTrOCR: Decoder-only Transformer for Optical Character Recognition WACV2024 + + +
+ Typical text recognition methods rely on an encoder-decoder structure, in +which the encoder extracts features from an image, and the decoder produces +recognized text from these features. In this study, we propose a simpler and +more effective method for text recognition, known as the Decoder-only +Transformer for Optical Character Recognition (DTrOCR). This method uses a +decoder-only Transformer to take advantage of a generative language model that +is pre-trained on a large corpus. We examined whether a generative language +model that has been successful in natural language processing can also be +effective for text recognition in computer vision. Our experiments demonstrated +that DTrOCR outperforms current state-of-the-art methods by a large margin in +the recognition of printed, handwritten, and scene text in both English and +Chinese. + +
+
+ comment: Accepted to WACV2024 +
+
+
+
+
+ + ☆ DiffuVolume: Diffusion Model for Volume based Stereo Matching + + +
+ Stereo matching is a significant part of many computer vision tasks and driving-based applications. Recently, cost volume-based methods have achieved great success benefiting from the rich geometry information in paired images. However, the redundancy of the cost volume also interferes with model training and limits performance. To construct a more precise cost volume, we pioneeringly apply the diffusion model to stereo matching. Our method, termed DiffuVolume, considers the diffusion model as a cost volume filter, which recurrently removes the redundant information from the cost volume. Two main designs make our method non-trivial. Firstly, to make the diffusion model more adaptive to stereo matching, we eschew the traditional manner of directly adding noise to the image and instead embed the diffusion model into a task-specific module. In this way, we outperform the traditional diffusion stereo matching method with a 22% EPE improvement and 240 times inference acceleration. Secondly, DiffuVolume can be easily embedded into any volume-based stereo matching network, boosting performance with only a slight rise in parameters (2%). By adding DiffuVolume to well-performing methods, we outperform all published methods on the Scene Flow, KITTI2012 and KITTI2015 benchmarks and in the zero-shot generalization setting. It is worth mentioning that the proposed model has ranked 1st on the KITTI 2012 leaderboard and 2nd on the KITTI 2015 leaderboard since 15 July 2023.
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ☆ Learning Structure-from-Motion with Graph Attention Networks + + +
+ In this paper we tackle the problem of learning Structure-from-Motion (SfM) through the use of graph attention networks. SfM is a classic computer vision problem that is solved through iterative minimization of reprojection errors, referred to as Bundle Adjustment (BA), starting from a good initialization. In order to obtain a good enough initialization for BA, conventional methods rely on a sequence of sub-problems (such as pairwise pose estimation, pose averaging or triangulation) which provide an initial solution that can then be refined using BA. In this work we replace these sub-problems by learning a model that takes as input the 2D keypoints detected across multiple views, and outputs the corresponding camera poses and 3D keypoint coordinates. Our model takes advantage of graph neural networks to learn SfM-specific primitives, and we show that it can be used for fast inference of the reconstruction for new and unseen sequences. The experimental results show that the proposed model outperforms competing learning-based methods, and challenges COLMAP while having lower runtime.
+
+
+
+
+ + ☆ RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation + + +
+ For robots to be useful outside labs and specialized factories we need a way +to teach them new useful behaviors quickly. Current approaches lack either the +generality to onboard new tasks without task-specific engineering, or else lack +the data-efficiency to do so in an amount of time that enables practical use. +In this work we explore dense tracking as a representational vehicle to allow +faster and more general learning from demonstration. Our approach utilizes +Track-Any-Point (TAP) models to isolate the relevant motion in a demonstration, +and parameterize a low-level controller to reproduce this motion across changes +in the scene configuration. We show this results in robust robot policies that +can solve complex object-arrangement tasks such as shape-matching, stacking, +and even full path-following tasks such as applying glue and sticking objects +together, all from demonstrations that can be collected in minutes. + +
+
+ comment: Project website: https://robotap.github.io +
+
+
+
+
+ + ☆ SHARP Challenge 2023: Solving CAD History and pArameters Recovery from + Point clouds and 3D scans. Overview, Datasets, Metrics, and Baselines + + +
+ Recent breakthroughs in geometric Deep Learning (DL) and the availability of +large Computer-Aided Design (CAD) datasets have advanced the research on +learning CAD modeling processes and relating them to real objects. In this +context, 3D reverse engineering of CAD models from 3D scans is considered to be +one of the most sought-after goals for the CAD industry. However, recent +efforts assume multiple simplifications limiting the applications in real-world +settings. The SHARP Challenge 2023 aims at pushing the research a step closer +to the real-world scenario of CAD reverse engineering through dedicated +datasets and tracks. In this paper, we define the proposed SHARP 2023 tracks, +describe the provided datasets, and propose a set of baseline methods along +with suitable evaluation metrics to assess the performance of the track +solutions. All proposed datasets along with useful routines and the evaluation +metrics are publicly available. + +
+
+
+
+
+ + ☆ Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting + + +
+ The task of radiology reporting comprises describing and interpreting the +medical findings in radiographic images, including description of their +location and appearance. Automated approaches to radiology reporting require +the image to be encoded into a suitable token representation for input to the +language model. Previous methods commonly use convolutional neural networks to +encode an image into a series of image-level feature map representations. +However, the generated reports often exhibit realistic style but imperfect +accuracy. Inspired by recent works for image captioning in the general domain +in which each visual token corresponds to an object detected in an image, we +investigate whether using local tokens corresponding to anatomical structures +can improve the quality of the generated reports. We introduce a novel +adaptation of Faster R-CNN in which finding detection is performed for the +candidate bounding boxes extracted during anatomical structure localisation. We +use the resulting bounding box feature representations as our set of +finding-aware anatomical tokens. This encourages the extracted anatomical +tokens to be informative about the findings they contain (required for the +final task of radiology reporting). Evaluating on the MIMIC-CXR dataset of +chest X-Ray images, we show that task-aware anatomical tokens give +state-of-the-art performance when integrated into an automated reporting +pipeline, yielding generated reports with improved clinical accuracy. + +
+
+
+
+
+ + ☆ Fusing Pseudo Labels with Weak Supervision for Dynamic Traffic Scenarios ICCV + + +
+ Advanced Driver Assistance Systems (ADAS) have made significant strides, +capitalizing on computer vision to enhance perception and decision-making +capabilities. Nonetheless, the adaptation of these systems to diverse traffic +scenarios poses challenges due to shifts in data distribution stemming from +factors such as location, weather, and road infrastructure. To tackle this, we +introduce a weakly-supervised label unification pipeline that amalgamates +pseudo labels from a multitude of object detection models trained on +heterogeneous datasets. Our pipeline engenders a unified label space through +the amalgamation of labels from disparate datasets, rectifying bias and +enhancing generalization. We fine-tune multiple object detection models on +individual datasets, subsequently crafting a unified dataset featuring pseudo +labels, meticulously validated for precision. Following this, we retrain a +solitary object detection model using the merged label space, culminating in a +resilient model proficient in dynamic traffic scenarios. We put forth a +comprehensive evaluation of our approach, employing diverse datasets +originating from varied Asian countries, effectively demonstrating its efficacy +in challenging road conditions. Notably, our method yields substantial +enhancements in object detection performance, culminating in a model with +heightened resistance against domain shifts. + +
+
+ comment: This work was accepted as an extended abstract at the International + Conference on Computer Vision (ICCV) 2023 BRAVO Workshop, Paris, France +
+
+
+
+
+ + ☆ Latency-aware Unified Dynamic Networks for Efficient Image Recognition + + +
+ Dynamic computation has emerged as a promising avenue to enhance the +inference efficiency of deep networks. It allows selective activation of +computational units, leading to a reduction in unnecessary computations for +each input sample. However, the actual efficiency of these dynamic models can +deviate from theoretical predictions. This mismatch arises from: 1) the lack of +a unified approach due to fragmented research; 2) the focus on algorithm design +over critical scheduling strategies, especially in CUDA-enabled GPU contexts; +and 3) challenges in measuring practical latency, given that most libraries +cater to static operations. Addressing these issues, we unveil the +Latency-Aware Unified Dynamic Networks (LAUDNet), a framework that integrates +three primary dynamic paradigms-spatially adaptive computation, dynamic layer +skipping, and dynamic channel skipping. To bridge the theoretical and practical +efficiency gap, LAUDNet merges algorithmic design with scheduling optimization, +guided by a latency predictor that accurately gauges dynamic operator latency. +We've tested LAUDNet across multiple vision tasks, demonstrating its capacity +to notably reduce the latency of models like ResNet-101 by over 50% on +platforms such as V100, RTX3090, and TX2 GPUs. Notably, LAUDNet stands out in +balancing accuracy and efficiency. Code is available at: +https://www.github.com/LeapLabTHU/LAUDNet. + +
+
+
+
+
+ + ☆ Stage-by-stage Wavelet Optimization Refinement Diffusion Model for + Sparse-View CT Reconstruction + + +
+ Diffusion models have emerged as potential tools to tackle the challenge of sparse-view CT reconstruction, displaying superior performance compared to conventional methods. Nevertheless, these prevailing diffusion models predominantly focus on the sinogram or image domains, which can lead to instability during model training, potentially culminating in convergence towards local minimal solutions. The wavelet transform serves to disentangle image contents and features into distinct frequency-component bands at varying scales, adeptly capturing diverse directional structures. Employing the wavelet transform as a guiding sparsity prior significantly enhances the robustness of diffusion models. In this study, we present an innovative approach named the Stage-by-stage Wavelet Optimization Refinement Diffusion (SWORD) model for sparse-view CT reconstruction. Specifically, we establish a unified mathematical model integrating low-frequency and high-frequency generative models, achieving the solution with an optimization procedure. Furthermore, we perform the low-frequency and high-frequency generative modeling on the wavelet-decomposed components rather than the sinogram or image domains, ensuring the stability of model training. Our method, rooted in established optimization theory, comprises three distinct stages: low-frequency generation, high-frequency refinement and domain transform. Our experimental results demonstrate that the proposed method outperforms existing state-of-the-art methods both quantitatively and qualitatively.
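To see the frequency split that SWORD operates on, the sketch below performs a single-level 2-D discrete wavelet transform with PyWavelets, yielding the low-frequency approximation and three high-frequency detail bands that the two generative models would handle separately; the wavelet family and decomposition level here are illustrative choices.

```python
import numpy as np
import pywt

# One level of the 2-D discrete wavelet transform splits an image into a
# low-frequency approximation and three high-frequency detail bands
# (horizontal, vertical, diagonal).
image = np.random.rand(256, 256)  # stand-in for a CT image
low, (horiz, vert, diag) = pywt.dwt2(image, "db2")

# The inverse transform is the "domain transform" step back to image space.
reconstructed = pywt.idwt2((low, (horiz, vert, diag)), "db2")
assert np.allclose(reconstructed, image)
```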
+
+
+
+
+ + ☆ AnoVL: Adapting Vision-Language Models for Unified Zero-shot Anomaly + Localization + + +
+ Contrastive Language-Image Pre-training (CLIP) models have shown promising performance on zero-shot visual recognition tasks by learning visual representations under natural language supervision. Recent studies have attempted to use CLIP to tackle zero-shot anomaly detection by matching images with normal and abnormal state prompts. However, since CLIP focuses on building correspondence between paired text prompts and global image-level representations, the lack of patch-level vision-to-text alignment limits its capability for precise visual anomaly localization. In this work, we introduce a training-free adaptation (TFA) framework of CLIP for zero-shot anomaly localization. In the visual encoder, we devise a training-free value-wise attention mechanism to extract intrinsic local tokens of CLIP for patch-level local description. From the perspective of text supervision, we particularly design a unified domain-aware contrastive state prompting template. On top of the proposed TFA, we further introduce a test-time adaptation (TTA) mechanism to refine anomaly localization results, where a layer of trainable parameters in the adapter is optimized using TFA's pseudo-labels and synthetic noise-corrupted tokens. With both TFA and TTA adaptation, we significantly exploit the potential of CLIP for zero-shot anomaly localization and demonstrate the effectiveness of our proposed methods on various datasets.
+
+
+
+
+ + ☆ Attention-based CT Scan Interpolation for Lesion Segmentation of + Colorectal Liver Metastases + + +
+ Small liver lesions common to colorectal liver metastases (CRLMs) are challenging for convolutional neural network (CNN) segmentation models, especially when there is a wide range of slice thicknesses in the computed tomography (CT) scans. Slice thickness of CT images may vary by clinical indication. For example, thinner slices are used for presurgical planning when fine anatomic details of small vessels are required. To keep the effective radiation dose in patients as low as possible, various slice thicknesses are employed for CRLMs, each with its own limitations. However, differences in slice thickness across CTs lead to significant performance degradation in CT segmentation models based on CNNs. This paper proposes a novel unsupervised attention-based interpolation model to generate intermediate slices from consecutive triplet slices in CT scans. We integrate segmentation loss during the interpolation model's training to leverage segmentation labels in existing slices to generate middle ones. Unlike common interpolation techniques in CT volumes, our model highlights the regions of interest (liver and lesions) inside the abdominal CT scans in the interpolated slice. Moreover, our model's outputs are consistent with the original input slices while increasing the segmentation performance in two cutting-edge 3D segmentation pipelines. We tested the proposed model on the CRLM dataset to upsample subjects with thick slices and create isotropic volumes for our segmentation model. The produced isotropic dataset increases the Dice score in the segmentation of lesions and outperforms other interpolation approaches in terms of interpolation metrics.
+
+
+
+
+ + ☆ Physics-Informed DeepMRI: Bridging the Gap from Heat Diffusion to + k-Space Interpolation + + +
+ In the field of parallel imaging (PI), alongside image-domain regularization +methods, substantial research has been dedicated to exploring $k$-space +interpolation. However, the interpretability of these methods remains an +unresolved issue. Furthermore, these approaches currently face acceleration +limitations that are comparable to those experienced by image-domain methods. +In order to enhance interpretability and overcome the acceleration limitations, +this paper introduces an interpretable framework that unifies both $k$-space +interpolation techniques and image-domain methods, grounded in the physical +principles of heat diffusion equations. Building upon this foundational +framework, a novel $k$-space interpolation method is proposed. Specifically, we +model the process of high-frequency information attenuation in $k$-space as a +heat diffusion equation, while the effort to reconstruct high-frequency +information from low-frequency regions can be conceptualized as a reverse heat +equation. However, solving the reverse heat equation poses a challenging +inverse problem. To tackle this challenge, we modify the heat equation to align +with the principles of magnetic resonance PI physics and employ the score-based +generative method to precisely execute the modified reverse heat diffusion. +Finally, experimental validation conducted on publicly available datasets +demonstrates the superiority of the proposed approach over traditional +$k$-space interpolation methods, deep learning-based $k$-space interpolation +methods, and conventional diffusion models in terms of reconstruction accuracy, +particularly in high-frequency regions. + +
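To ground the analogy, recall that the heat equation acts in Fourier space as a simple frequency-dependent damping, which is exactly why it models the attenuation of high-frequency k-space content. The sketch below applies that forward damping to an (unshifted) k-space array; the reverse direction is the ill-posed problem the paper tackles with a score-based generative model.

```python
import numpy as np

def heat_diffuse_kspace(kspace: np.ndarray, t: float) -> np.ndarray:
    """Forward heat diffusion applied directly in k-space (sketch).

    The heat equation u_t = laplacian(u) acts in Fourier space as
    multiplication by exp(-|k|^2 * t), so high frequencies decay fastest.
    Assumes unshifted (DC-at-corner) frequency ordering.
    """
    ny, nx = kspace.shape
    ky = np.fft.fftfreq(ny) * 2 * np.pi
    kx = np.fft.fftfreq(nx) * 2 * np.pi
    k2 = ky[:, None] ** 2 + kx[None, :] ** 2   # |k|^2 on the sampling grid
    return kspace * np.exp(-k2 * t)
```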
+
+
+
+
+ + ☆ On the Potential of CLIP for Compositional Logical Reasoning + + +
+ In this paper we explore the possibility of using OpenAI's CLIP to perform +logically coherent grounded visual reasoning. To that end, we formalize our +terms and give a geometric analysis of how embeddings in CLIP's latent space +would need to be configured in order for the system to be logically coherent. +Our main conclusion is that, as usually configured, CLIP cannot perform such +reasoning. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Interpretability-guided Data Augmentation for Robust Segmentation in + Multi-centre Colonoscopy Data MICCAI 2023 + + +
+ Multi-centre colonoscopy images from various medical centres exhibit distinct +complicating factors and overlays that impact the image content, contingent on +the specific acquisition centre. Existing Deep Segmentation networks struggle +to achieve adequate generalizability in such data sets, and the currently +available data augmentation methods do not effectively address these sources of +data variability. As a solution, we introduce an innovative data augmentation +approach centred on interpretability saliency maps, aimed at enhancing the +generalizability of Deep Learning models within the realm of multi-centre +colonoscopy image segmentation. The proposed augmentation technique +demonstrates increased robustness across different segmentation models and +domains. Thorough testing on a publicly available multi-centre dataset for +polyp detection demonstrates the effectiveness and versatility of our approach, +which is observed both in quantitative and qualitative results. The code is +publicly available at: +https://github.com/nki-radiology/interpretability_augmentation + +
+
+ comment: 10 pages, 4 figures, 1 table, accepted at MICCAI 2023 Workshop on + Machine Learning in Medical Imaging (MLMI) +
+
+
+
+
+ + ☆ Feature Attention Network (FA-Net): A Deep-Learning Based Approach for + Underwater Single Image Enhancement + + +
+ Underwater image processing and analysis have been a hotspot of study in recent years, as more emphasis has been placed on underwater monitoring and the usage of marine resources. Compared with the open environment, underwater images encounter more complicated conditions such as light absorption, scattering, turbulence, nonuniform illumination and color diffusion. Although considerable advances and enhancement techniques have been achieved in resolving these issues, they treat low-frequency information equally across the entire channel, which limits the network's representativeness. We propose a deep learning and feature-attention-based end-to-end network (FA-Net) to solve this problem. In particular, we propose a Residual Feature Attention Block (RFAB), containing channel attention, pixel attention, and a residual learning mechanism with long and short skip connections. RFAB allows the network to focus on learning high-frequency information while skipping low-frequency information on multi-hop connections. The channel and pixel attention mechanisms consider each channel's different features and the uneven distribution of haze over different pixels in the image. The experimental results show that our proposed FA-Net provides higher accuracy, both quantitatively and qualitatively, and is superior to previous state-of-the-art methods.
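A minimal PyTorch sketch of a block in the spirit of RFAB follows: a convolutional body, squeeze-and-excite style channel attention, a per-pixel attention map, and a short skip connection. Layer sizes and ordering are assumptions for illustration rather than FA-Net's exact design.

```python
import torch
import torch.nn as nn

class RFAB(nn.Module):
    """Residual Feature Attention Block (sketch, not FA-Net's exact layout)."""

    def __init__(self, channels: int, reduction: int = 8):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1),
        )
        self.channel_att = nn.Sequential(            # squeeze-and-excite style
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, channels // reduction, 1), nn.ReLU(inplace=True),
            nn.Conv2d(channels // reduction, channels, 1), nn.Sigmoid(),
        )
        self.pixel_att = nn.Sequential(              # per-pixel attention map
            nn.Conv2d(channels, 1, 1), nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = self.body(x)
        y = y * self.channel_att(y)   # re-weight channels
        y = y * self.pixel_att(y)     # re-weight spatial positions
        return x + y                  # short skip: learn the high-frequency residual
```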
+
+ comment: Fourteenth International Conference on Digital Image Processing + (ICDIP 2022), 2022, Wuhan, China, May 20-23, 2022.8 pages.5 Figures.doi: + 10.1117/12.2644516 +
+
+
+
+
+ + ☆ Semi-supervised Domain Adaptation with Inter and Intra-domain Mixing for + Semantic Segmentation + + +
+ Despite recent advances in semantic segmentation, an inevitable challenge is the performance degradation caused by the domain shift in real applications. The current dominant approach to this problem is unsupervised domain adaptation (UDA). However, the absence of labeled target data in UDA is overly restrictive and limits performance. To overcome this limitation, a more practical scenario called semi-supervised domain adaptation (SSDA) has been proposed. Existing SSDA methods are derived from the UDA paradigm and primarily focus on leveraging the unlabeled target data and source data. In this paper, we highlight the significance of exploiting the intra-domain information between the limited labeled target data and the unlabeled target data, as it greatly benefits domain adaptation. Instead of solely using the scarce labeled data for supervision, we propose a novel SSDA framework that incorporates both inter-domain mixing and intra-domain mixing, where inter-domain mixing mitigates the source-target domain gap and intra-domain mixing enriches the available target domain information. By simultaneously learning from inter-domain mixing and intra-domain mixing, the network can capture more domain-invariant features and promote its performance on the target domain. We also explore different domain mixing operations to better exploit the target domain information. Comprehensive experiments conducted on the GTA5toCityscapes and SYNTHIA2Cityscapes benchmarks demonstrate the effectiveness of our method, surpassing previous methods by a large margin.
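+ One common way to realize such mixing is CutMix-style pasting; a sketch under the assumption that inter-domain mixing pairs a source image with a pseudo-labeled target image and intra-domain mixing pairs labeled and unlabeled target images (the paper's exact operator may differ):
+
+ import torch
+
+ def cut_mix(img_a, lbl_a, img_b, lbl_b, frac=0.5):
+     """Paste a random box from sample b into sample a (image and label)."""
+     _, h_img, w_img = img_a.shape
+     h, w = int(h_img * frac), int(w_img * frac)
+     y = torch.randint(0, h_img - h + 1, (1,)).item()
+     x = torch.randint(0, w_img - w + 1, (1,)).item()
+     img, lbl = img_a.clone(), lbl_a.clone()
+     img[:, y:y + h, x:x + w] = img_b[:, y:y + h, x:x + w]
+     lbl[y:y + h, x:x + w] = lbl_b[y:y + h, x:x + w]
+     return img, lbl
+
+ # inter-domain: cut_mix(source_img, source_lbl, target_img, target_pseudo_lbl)
+ # intra-domain: cut_mix(labeled_tgt_img, tgt_lbl, unlabeled_tgt_img, pseudo_lbl)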
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Zero-shot Inversion Process for Image Attribute Editing with Diffusion + Models + + +
+ Denoising diffusion models have shown outstanding performance in image +editing. Existing works tend to use either image-guided methods, which provide +a visual reference but lack control over semantic coherence, or text-guided +methods, which ensure faithfulness to text guidance but lack visual quality. To +address the problem, we propose the Zero-shot Inversion Process (ZIP), a +framework that injects a fusion of generated visual reference and text guidance +into the semantic latent space of a \textit{frozen} pre-trained diffusion +model. Only using a tiny neural network, the proposed ZIP produces diverse +content and attributes under the intuitive control of the text prompt. +Moreover, ZIP shows remarkable robustness for both in-domain and out-of-domain +attribute manipulation on real images. We perform detailed experiments on +various benchmark datasets. Compared to state-of-the-art methods, ZIP produces +images of equivalent quality while providing a realistic editing effect. + +
+
+
+
+
+ + ☆ Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object + Detection + + +
+ In this paper, we are the first to explore helpful multi-modal contextual knowledge for understanding novel categories in open-vocabulary object detection (OVD). The multi-modal contextual knowledge stands for the joint relationship across regions and words. However, it is challenging to incorporate such multi-modal contextual knowledge into OVD, because previous detection frameworks fail to jointly model multi-modal contextual knowledge: object detectors only support vision inputs, and no caption description is provided at test time. To this end, we propose a multi-modal contextual knowledge distillation framework, MMC-Det, to transfer the learned contextual knowledge from a teacher fusion transformer with diverse multi-modal masked language modeling (D-MLM) to a student detector. The diverse multi-modal masked language modeling is realized by an object divergence constraint upon traditional multi-modal masked language modeling (MLM), in order to extract fine-grained region-level visual contexts, which are vital to object detection. Extensive experiments performed on various detection datasets show the effectiveness of our multi-modal context learning strategy, where our approach clearly outperforms the recent state-of-the-art methods.
+
+
+
+
+ + ☆ Reconstructing Groups of People with Hypergraph Relational Reasoning ICCV2023 + + +
+ Due to the mutual occlusion, severe scale variation, and complex spatial +distribution, the current multi-person mesh recovery methods cannot produce +accurate absolute body poses and shapes in large-scale crowded scenes. To +address the obstacles, we fully exploit crowd features for reconstructing +groups of people from a monocular image. A novel hypergraph relational +reasoning network is proposed to formulate the complex and high-order relation +correlations among individuals and groups in the crowd. We first extract +compact human features and location information from the original +high-resolution image. By conducting the relational reasoning on the extracted +individual features, the underlying crowd collectiveness and interaction +relationship can provide additional group information for the reconstruction. +Finally, the updated individual features and the localization information are +used to regress human meshes in camera coordinates. To facilitate the network +training, we further build pseudo ground-truth on two crowd datasets, which may +also promote future research on pose estimation and human behavior +understanding in crowded scenes. The experimental results show that our +approach outperforms other baseline methods both in crowded and common +scenarios. The code and datasets are publicly available at +https://github.com/boycehbz/GroupRec. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Utilizing Task-Generic Motion Prior to Recover Full-Body Motion from + Very Sparse Signals + + +
+ The most popular type of device used to track a user's posture in a virtual reality experience consists of a head-mounted display and two controllers held in both hands. However, due to the limited number of tracking sensors (three in total), faithfully recovering the user's full body is challenging, limiting the potential for interactions among simulated user avatars within the virtual world. Therefore, recent studies have attempted to reconstruct full-body poses using neural networks that utilize previously learned human poses or accept a series of past poses over a short period. In this paper, we propose a method that utilizes information from a neural motion prior to improve the accuracy of the reconstructed user's motions. Our approach aims to reconstruct the user's full-body poses by predicting the latent representation of the user's overall motion from limited input signals and integrating this information with tracking sensor inputs. This is based on the premise that the ultimate goal of pose reconstruction is to reconstruct the motion, which is a series of poses. Our results show that this integration enables more accurate reconstruction of the user's full-body motion, particularly enhancing the robustness of lower body motion reconstruction from impoverished signals. Web: https://mjsh34.github.io/mp-sspe/
+
+
+
+
+ + ☆ Early Detection of Red Palm Weevil Infestations using Deep Learning + Classification of Acoustic Signals + + +
+ The Red Palm Weevil (RPW), also known as the palm weevil, is considered among the world's most damaging insect pests of palms. Current detection techniques include the detection of symptoms of RPW using visual or sound inspection and chemical detection of volatile signatures generated by infested palm trees. However, efficient detection of RPW infestation at an early stage is considered one of the most challenging issues for cultivating date palms. In this paper, an efficient approach to the early detection of RPW is proposed. The proposed approach is based on recording and analyzing RPW sound activity. The first step involves the conversion of sound data into images based on a selected set of features. The second step involves combining images from the same sound file, computed with different features, into a single image. The third step involves the application of different Deep Learning (DL) techniques to classify the resulting images into two classes: infested and not infested. Experimental results show good performance of the proposed approach for RPW detection using different DL techniques, namely MobileNetV2, ResNet50V2, ResNet152V2, VGG16, VGG19, DenseNet121, DenseNet201, Xception, and InceptionV3. The proposed approach outperformed existing techniques on public datasets.
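+ A sketch of the sound-to-image step, assuming librosa and mel/MFCC/chroma as the "selected set of features" (the abstract does not commit to these exact features); the per-feature images are normalized and fused as channels of one image:
+
+ import numpy as np
+ import librosa
+
+ def sound_to_fused_image(wav_path):
+     y, sr = librosa.load(wav_path, sr=16000)
+     mel = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64))
+     mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=64)
+     chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=64)
+
+     def norm(a):  # scale each feature image to [0, 1]
+         return (a - a.min()) / (a.max() - a.min() + 1e-8)
+
+     # fuse the three feature images into a single 3-channel "image"
+     return np.stack([norm(mel), norm(mfcc), norm(chroma)], axis=-1)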
+
+
+
+
+ + ☆ Introducing Language Guidance in Prompt-based Continual Learning ICCV 2023 + + +
+ Continual Learning aims to learn a single model on a sequence of tasks +without having access to data from previous tasks. The biggest challenge in the +domain still remains catastrophic forgetting: a loss in performance on seen +classes of earlier tasks. Some existing methods rely on an expensive replay +buffer to store a chunk of data from previous tasks. This, while promising, +becomes expensive when the number of tasks becomes large or data can not be +stored for privacy reasons. As an alternative, prompt-based methods have been +proposed that store the task information in a learnable prompt pool. This +prompt pool instructs a frozen image encoder on how to solve each task. While +the model faces a disjoint set of classes in each task in this setting, we +argue that these classes can be encoded to the same embedding space of a +pre-trained language encoder. In this work, we propose Language Guidance for +Prompt-based Continual Learning (LGCL) as a plug-in for prompt-based methods. +LGCL is model agnostic and introduces language guidance at the task level in +the prompt pool and at the class level on the output feature of the vision +encoder. We show with extensive experimentation that LGCL consistently improves +the performance of prompt-based continual learning methods to set a new +state-of-the art. LGCL achieves these performance improvements without needing +any additional learnable parameters. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ AMDNet23: A combined deep Contour-based Convolutional Neural Network and + Long Short Term Memory system to diagnose Age-related Macular Degeneration + + +
+ In light of the expanding population, an automated framework for disease detection can assist doctors in the diagnosis of ocular diseases, yield accurate, stable, and rapid outcomes, and improve the success rate of early detection. The work initially enhances the quality of fundus images by employing an adaptive contrast enhancement algorithm (CLAHE) and gamma correction. In the preprocessing stage, CLAHE elevates the local contrast of the fundus image and gamma correction increases the intensity of relevant features. This study proposes AMDNet23, a deep learning system that combines a convolutional neural network (CNN) with long short-term memory (LSTM) to automatically detect age-related macular degeneration (AMD) from fundus ophthalmology images. In this mechanism, the CNN is utilized for extracting features and the LSTM is utilized to classify the extracted features. The dataset of this research was collected from multiple sources and afterward subjected to quality assessment techniques; the 2000 experimental fundus images encompass four distinct classes equitably. The proposed hybrid deep AMDNet23 model demonstrates the detection of AMD ocular disease, and the experiments achieved an accuracy of 96.50%, specificity of 99.32%, sensitivity of 96.5%, and F1-score of 96.49%. The system achieves state-of-the-art findings on fundus imagery datasets for diagnosing AMD ocular disease, and the results effectively demonstrate the potential of our method.
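+ The preprocessing stage is standard enough to sketch directly with OpenCV (the clip limit, tile size, and gamma below are illustrative defaults, not the paper's tuned values):
+
+ import cv2
+ import numpy as np
+
+ def preprocess_fundus(bgr, clip=2.0, gamma=1.2):
+     lab = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
+     l, a, b = cv2.split(lab)
+     clahe = cv2.createCLAHE(clipLimit=clip, tileGridSize=(8, 8))
+     l = clahe.apply(l)                        # local contrast enhancement
+     out = cv2.cvtColor(cv2.merge((l, a, b)), cv2.COLOR_LAB2BGR)
+     lut = (np.linspace(0, 1, 256) ** (1.0 / gamma) * 255).astype(np.uint8)
+     return cv2.LUT(out, lut)                  # gamma correction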
+
+
+
+
+ + ☆ Improving Underwater Visual Tracking With a Large Scale Dataset and + Image Enhancement + + +
+ This paper presents a new dataset and general tracker enhancement method for +Underwater Visual Object Tracking (UVOT). Despite its significance, underwater +tracking has remained unexplored due to data inaccessibility. It poses distinct +challenges; the underwater environment exhibits non-uniform lighting +conditions, low visibility, lack of sharpness, low contrast, camouflage, and +reflections from suspended particles. Performance of traditional tracking +methods designed primarily for terrestrial or open-air scenarios drops in such +conditions. We address the problem by proposing a novel underwater image +enhancement algorithm designed specifically to boost tracking quality. The +method has resulted in a significant performance improvement, of up to 5.0% +AUC, of state-of-the-art (SOTA) visual trackers. To develop robust and accurate +UVOT methods, large-scale datasets are required. To this end, we introduce a +large-scale UVOT benchmark dataset consisting of 400 video segments and 275,000 +manually annotated frames enabling underwater training and evaluation of deep +trackers. The videos are labelled with several underwater-specific tracking +attributes including watercolor variation, target distractors, camouflage, +target relative size, and low visibility conditions. The UVOT400 dataset, +tracking results, and the code are publicly available on: +https://github.com/BasitAlawode/UWVOT400. + +
+
+
+
+
+ + ☆ ACNPU: A 4.75TOPS/W 1080P@30FPS Super Resolution Accelerator with + Decoupled Asymmetric Convolution + + +
+ Deep learning-driven superresolution (SR) outperforms traditional techniques +but also faces the challenge of high complexity and memory bandwidth. This +challenge leads many accelerators to opt for simpler and shallow models like +FSRCNN, compromising performance for real-time needs, especially for +resource-limited edge devices. This paper proposes an energy-efficient SR +accelerator, ACNPU, to tackle this challenge. The ACNPU enhances image quality +by 0.34dB with a 27-layer model, but needs 36\% less complexity than FSRCNN, +while maintaining a similar model size, with the \textit{decoupled asymmetric +convolution and split-bypass structure}. The hardware-friendly 17K-parameter +model enables \textit{holistic model fusion} instead of localized layer fusion +to remove external DRAM access of intermediate feature maps. The on-chip memory +bandwidth is further reduced with the \textit{input stationary flow} and +\textit{parallel-layer execution} to reduce power consumption. Hardware is +regular and easy to control to support different layers by \textit{processing +elements (PEs) clusters with reconfigurable input and uniform data flow}. The +implementation in the 40 nm CMOS process consumes 2333 K gate counts and 198KB +SRAMs. The ACNPU achieves 31.7 FPS and 124.4 FPS for x2 and x4 scales Full-HD +generation, respectively, which attains 4.75 TOPS/W energy efficiency. + +
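+ One plausible reading of the decoupled asymmetric convolution with a split-bypass structure, sketched in PyTorch (the actual ACNPU operator is a hardware design and may differ in detail):
+
+ import torch
+ import torch.nn as nn
+
+ class DecoupledAsymConv(nn.Module):
+     """A 3x3 convolution approximated by 3x1 and 1x3 branches plus a
+     1x1 bypass, reducing multiply-accumulate count."""
+     def __init__(self, ch):
+         super().__init__()
+         self.vert = nn.Conv2d(ch, ch, (3, 1), padding=(1, 0))
+         self.horz = nn.Conv2d(ch, ch, (1, 3), padding=(0, 1))
+         self.bypass = nn.Conv2d(ch, ch, 1)    # split-bypass branch
+
+     def forward(self, x):
+         return self.vert(x) + self.horz(x) + self.bypass(x)
+
+ print(DecoupledAsymConv(16)(torch.randn(1, 16, 8, 8)).shape)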
+
+ comment: 9 pages, 14 figures +
+
+
+
+
+ + ☆ Occlusion-Aware Detection and Re-ID Calibrated Network for Multi-Object + Tracking + + +
+ Multi-Object Tracking (MOT) is a crucial computer vision task that aims to predict the bounding boxes and identities of objects simultaneously. While state-of-the-art methods have made remarkable progress by jointly optimizing the multi-task problems of detection and Re-ID feature learning, few approaches tackle the occlusion issue, which is a long-standing challenge in the MOT field. Generally, occluded objects may hinder the detector from estimating the bounding boxes, resulting in fragmented trajectories, and the Re-ID embeddings learned for occluded objects are less distinct since they contain interference. To this end, we propose an occlusion-aware detection and Re-ID calibrated network for multi-object tracking, termed ORCTrack. Specifically, we propose an Occlusion-Aware Attention (OAA) module in the detector that highlights object features while suppressing occluded background regions. OAA serves as a modulator that enhances the detector for potentially occluded objects. Furthermore, we design a Re-ID embedding matching block based on the optimal transport problem, which focuses on enhancing and calibrating the Re-ID representations across adjacent frames in a complementary manner. To validate the effectiveness of the proposed method, extensive experiments are conducted on the challenging VisDrone2021-MOT and KITTI benchmarks. Experimental evaluations demonstrate the superiority of our approach, which achieves new state-of-the-art performance and enjoys high run-time efficiency.
+
+
+
+
+ + ☆ Neural Video Compression with Temporal Layer-Adaptive Hierarchical + B-frame Coding + + +
+ Neural video compression (NVC) is a rapidly evolving video coding research +area, with some models achieving superior coding efficiency compared to the +latest video coding standard Versatile Video Coding (VVC). In conventional +video coding standards, the hierarchical B-frame coding, which utilizes a +bidirectional prediction structure for higher compression, had been +well-studied and exploited. In NVC, however, limited research has investigated +the hierarchical B scheme. In this paper, we propose an NVC model exploiting +hierarchical B-frame coding with temporal layer-adaptive optimization. We first +extend an existing unidirectional NVC model to a bidirectional model, which +achieves -21.13% BD-rate gain over the unidirectional baseline model. However, +this model faces challenges when applied to sequences with complex or large +motions, leading to performance degradation. To address this, we introduce +temporal layer-adaptive optimization, incorporating methods such as temporal +layer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent +scaling (TALS). The final model with the proposed methods achieves an +impressive BD-rate gain of -39.86% against the baseline. It also resolves the +challenges in sequences with large or complex motions with up to -49.13% more +BD-rate gains than the simple bidirectional extension. This improvement is +attributed to the allocation of more bits to lower temporal layers, thereby +enhancing overall reconstruction quality with smaller bits. Since our method +has little dependency on a specific NVC model architecture, it can serve as a +general tool for extending unidirectional NVC models to the ones with +hierarchical B-frame coding. + +
+
+
+
+
+ + ☆ Large-scale data extraction from the UNOS organ donor documents + + +
+ The scope of our study is all UNOS data on USA organ donors since 2008. The data was not analyzable at a large scale in the past because it was captured in PDF documents known as "Attachments", whereby every donor is represented by dozens of PDF documents in heterogeneous formats. To make the data analyzable, one needs to convert the content inside these PDFs to an analyzable data format, such as a standard SQL database. In this paper we focus on the 2022 UNOS data, comprising $\approx 400,000$ PDF documents spanning millions of pages. The totality of UNOS data covers 15 years (2008--2022), and our results will be quickly extended to the entire data. Our method captures a portion of the data in DCD flowsheets, kidney perfusion data, and data captured during patient hospital stays (e.g. vital signs, ventilator settings, etc.). The current paper assumes that the reader is familiar with the content of the UNOS data; an overview of the types of data and the challenges they present is the subject of another paper. Here we focus on demonstrating that the goal of building a comprehensive, analyzable database from UNOS documents is an attainable task, and we provide an overview of our methodology. The project resulted in datasets by far larger than previously available, even in this preliminary phase.
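+ A toy sketch of the PDF-to-SQL direction, using pdfplumber and sqlite3 as stand-in tools (the actual pipeline must handle the heterogeneous UNOS attachment formats with per-format parsers):
+
+ import sqlite3
+ import pdfplumber
+
+ def pdf_tables_to_sqlite(pdf_path, db_path="unos.db"):
+     con = sqlite3.connect(db_path)
+     con.execute("CREATE TABLE IF NOT EXISTS rows (doc TEXT, page INT, cells TEXT)")
+     with pdfplumber.open(pdf_path) as pdf:
+         for pno, page in enumerate(pdf.pages):
+             for table in page.extract_tables():
+                 for row in table:
+                     con.execute("INSERT INTO rows VALUES (?, ?, ?)",
+                                 (pdf_path, pno, "|".join(c or "" for c in row)))
+     con.commit()
+     con.close()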
+
+
+
+
+ + ☆ Beard Segmentation and Recognition Bias + + +
+ A person's facial hairstyle, such as presence and size of beard, can +significantly impact face recognition accuracy. There are publicly-available +deep networks that achieve reasonable accuracy at binary attribute +classification, such as beard / no beard, but few if any that segment the +facial hair region. To investigate the effect of facial hair in a rigorous +manner, we first created a set of fine-grained facial hair annotations to train +a segmentation model and evaluate its accuracy across African-American and +Caucasian face images. We then use our facial hair segmentations to categorize +image pairs according to the degree of difference or similarity in the facial +hairstyle. We find that the False Match Rate (FMR) for image pairs with +different categories of facial hairstyle varies by a factor of over 10 for +African-American males and over 25 for Caucasian males. To reduce the bias +across image pairs with different facial hairstyles, we propose a scheme for +adaptive thresholding based on facial hairstyle similarity. Evaluation on a +subject-disjoint set of images shows that adaptive similarity thresholding +based on facial hairstyles of the image pair reduces the ratio between the +highest and lowest FMR across facial hairstyle categories for African-American +from 10.7 to 1.8 and for Caucasians from 25.9 to 1.3. Facial hair annotations +and facial hair segmentation model will be publicly available. + +
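+ The adaptive-thresholding idea reduces to a per-category-pair threshold lookup; a sketch with hypothetical categories and threshold values (the paper derives its thresholds from facial-hairstyle similarity, not from the numbers below):
+
+ def adaptive_match(score, cat_a, cat_b, thresholds, default=0.5):
+     """Declare a match using a threshold chosen per facial-hairstyle pair."""
+     key = tuple(sorted((cat_a, cat_b)))
+     return score >= thresholds.get(key, default)
+
+ # thresholds tuned so the FMR is roughly equal across category pairs
+ thresholds = {("beard", "beard"): 0.55,
+               ("beard", "clean-shaven"): 0.62,
+               ("clean-shaven", "clean-shaven"): 0.50}
+ print(adaptive_match(0.58, "clean-shaven", "beard", thresholds))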
+
+
+
+
+ + ☆ Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale + Drone Survey + + +
+ Neural rendering has garnered substantial attention owing to its capacity for +creating realistic 3D scenes. However, its applicability to extensive scenes +remains challenging, with limitations in effectiveness. In this work, we +propose the Drone-NeRF framework to enhance the efficient reconstruction of +unbounded large-scale scenes suited for drone oblique photography using Neural +Radiance Fields (NeRF). Our approach involves dividing the scene into uniform +sub-blocks based on camera position and depth visibility. Sub-scenes are +trained in parallel using NeRF, then merged for a complete scene. We refine the +model by optimizing camera poses and guiding NeRF with a uniform sampler. +Integrating chosen samples enhances accuracy. A hash-coded fusion MLP +accelerates density representation, yielding RGB and Depth outputs. Our +framework accounts for sub-scene constraints, reduces parallel-training noise, +handles shadow occlusion, and merges sub-regions for a polished rendering +result. This Drone-NeRF framework demonstrates promising capabilities in +addressing challenges related to scene complexity, rendering efficiency, and +accuracy in drone-obtained imagery. + +
+
+ comment: 15 pages, 7 figures, in submission +
+
+
+
+
+ + ☆ Background Debiased SAR Target Recognition via Causal Interventional + Regularizer + + +
+ Recent studies have utilized deep learning (DL) techniques to automatically +extract features from synthetic aperture radar (SAR) images, which shows great +promise for enhancing the performance of SAR automatic target recognition +(ATR). However, our research reveals a previously overlooked issue: SAR images +to be recognized include not only the foreground (i.e., the target), but also a +certain size of the background area. When a DL-model is trained exclusively on +foreground data, its recognition performance is significantly superior to a +model trained on original data that includes both foreground and background. +This suggests that the presence of background impedes the ability of the +DL-model to learn additional semantic information about the target. To address +this issue, we construct a structural causal model (SCM) that incorporates the +background as a confounder. Based on the constructed SCM, we propose a causal +intervention based regularization method to eliminate the negative impact of +background on feature semantic learning and achieve background debiased +SAR-ATR. The proposed causal interventional regularizer can be integrated into +any existing DL-based SAR-ATR models to mitigate the impact of background +interference on the feature extraction and recognition accuracy. Experimental +results on the Moving and Stationary Target Acquisition and Recognition (MSTAR) +dataset indicate that the proposed method can enhance the efficiency of +existing DL-based methods in a plug-and-play manner. + +
+
+ comment: 38 pages, 8 figures +
+
+
+
+
+ + ☆ Towards Earlier Detection of Oral Diseases On Smartphones Using Oral and + Dental RGB Images + + +
+ Oral diseases such as periodontal (gum) diseases and dental caries (cavities) +affect billions of people across the world today. However, previous +state-of-the-art models have relied on X-ray images to detect oral diseases, +making them inaccessible to remote monitoring, developing countries, and +telemedicine. To combat this overuse of X-ray imagery, we propose a lightweight +machine learning model capable of detecting calculus (also known as hardened +plaque or tartar) in RGB images while running efficiently on low-end devices. +The model, a modified MobileNetV3-Small neural network transfer learned from +ImageNet, achieved an accuracy of 72.73% (which is comparable to +state-of-the-art solutions) while still being able to run on mobile devices due +to its reduced memory requirements and processing times. A ResNet34-based model +was also constructed and achieved an accuracy of 81.82%. Both of these models +were tested on a mobile app, demonstrating their potential to limit the number +of serious oral disease cases as their predictions can help patients schedule +appointments earlier without the need to go to the clinic. + +
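+ The transfer-learning recipe is conventional; a sketch with torchvision (binary head for calculus present/absent; freezing the backbone is one common option, not necessarily the paper's):
+
+ import torch.nn as nn
+ from torchvision import models
+
+ net = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.DEFAULT)
+ net.classifier[3] = nn.Linear(net.classifier[3].in_features, 2)  # new head
+ for p in net.features.parameters():   # optionally freeze the ImageNet backbone
+     p.requires_grad = False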
+
+ comment: 10 pages, 6 figures, 1 formula. This research was conducted as a + mentored project performed for a college course and research program at the + University of California Santa Barbara's Summer Research Academies program +
+
+
+
+
+ + ☆ Intriguing Properties of Diffusion Models: A Large-Scale Dataset for + Evaluating Natural Attack Capability in Text-to-Image Generative Models + + +
+ Denoising probabilistic diffusion models have shown breakthrough performance +that can generate more photo-realistic images or human-level illustrations than +the prior models such as GANs. This high image-generation capability has +stimulated the creation of many downstream applications in various areas. +However, we find that this technology is indeed a double-edged sword: We +identify a new type of attack, called the Natural Denoising Diffusion (NDD) +attack based on the finding that state-of-the-art deep neural network (DNN) +models still hold their prediction even if we intentionally remove their robust +features, which are essential to the human visual system (HVS), by text +prompts. The NDD attack can generate low-cost, model-agnostic, and +transferrable adversarial attacks by exploiting the natural attack capability +in diffusion models. Motivated by the finding, we construct a large-scale +dataset, Natural Denoising Diffusion Attack (NDDA) dataset, to systematically +evaluate the risk of the natural attack capability of diffusion models with +state-of-the-art text-to-image diffusion models. We evaluate the natural attack +capability by answering 6 research questions. Through a user study to confirm +the validity of the NDD attack, we find that the NDD attack can achieve an 88% +detection rate while being stealthy to 93% of human subjects. We also find that +the non-robust features embedded by diffusion models contribute to the natural +attack capability. To confirm the model-agnostic and transferrable attack +capability, we perform the NDD attack against an AD vehicle and find that 73% +of the physically printed attacks can be detected as a stop sign. We hope that +our study and dataset can help our community to be aware of the risk of +diffusion models and facilitate further research toward robust DNN models. + +
+
+
+
+
+ + ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in +soybean sprouts image analysis. The CongNaMul dataset is curated to facilitate +tasks such as image classification, semantic segmentation, decomposition, and +measurement of length and weight. The classification task provides four classes +to determine the quality of soybean sprouts: normal, broken, spotted, and +broken and spotted, for the development of AI-aided automatic quality +inspection technology. For semantic segmentation, images with varying +complexity, from single sprout images to images with multiple sprouts, along +with human-labelled mask images, are included. The label has 4 different +classes: background, head, body, tail. The dataset also provides images and +masks for the image decomposition task, including two separate sprout images +and their combined form. Lastly, 5 physical features of sprouts (head length, +body length, body thickness, tail length, weight) are provided for image-based +measurement tasks. This dataset is expected to be a valuable resource for a +wide range of research and applications in the advanced analysis of images of +soybean sprouts. Also, we hope that this dataset can assist researchers +studying classification, semantic segmentation, decomposition, and physical +feature measurement in other industrial fields, in evaluating their models. The +dataset is available at the authors' repository. (https://bhban.kr/data) + +
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ☆ A Recycling Training Strategy for Medical Image Segmentation with + Diffusion Denoising Models + + +
+ Denoising diffusion models have found applications in image segmentation by +generating segmented masks conditioned on images. Existing studies +predominantly focus on adjusting model architecture or improving inference such +as test-time sampling strategies. In this work, we focus on training strategy +improvements and propose a novel recycling method. During each training step, a +segmentation mask is first predicted given an image and a random noise. This +predicted mask, replacing the conventional ground truth mask, is used for +denoising task during training. This approach can be interpreted as aligning +the training strategy with inference by eliminating the dependence on ground +truth masks for generating noisy samples. Our proposed method significantly +outperforms standard diffusion training, self-conditioning, and existing +recycling strategies across multiple medical imaging data sets: muscle +ultrasound, abdominal CT, prostate MR, and brain MR. This holds true for two +widely adopted sampling strategies: denoising diffusion probabilistic model and +denoising diffusion implicit model. Importantly, existing diffusion models +often display a declining or unstable performance during inference, whereas our +novel recycling consistently enhances or maintains performance. Furthermore, we +show for the first time that, under a fair comparison with the same network +architectures and computing budget, the proposed recycling-based diffusion +models achieved on-par performance with non-diffusion-based supervised +training. This paper summarises these quantitative results and discusses their +values, with a fully reproducible JAX-based implementation, released at +https://github.com/mathpluscode/ImgX-DiffSeg. + +
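+ A simplified PyTorch-style sketch of one recycling training step (the released implementation is JAX-based; the linear schedule below is a stand-in for the DDPM one, and model(image, noisy, t) is an assumed signature):
+
+ import torch
+ import torch.nn.functional as F
+
+ def recycling_step(model, image, gt_mask, T=1000):
+     noise = torch.randn_like(gt_mask)
+     t = torch.randint(1, T, (image.shape[0],))
+     # 1) first pass: predict a mask from the image and pure noise
+     with torch.no_grad():
+         pred_mask = model(image, noise, t)
+     # 2) build the noisy sample from the *predicted* mask, not the ground truth
+     alpha = (1.0 - t.float() / T).view(-1, 1, 1, 1)
+     noisy = alpha.sqrt() * pred_mask + (1.0 - alpha).sqrt() * noise
+     # 3) denoise and supervise against the ground-truth mask
+     return F.mse_loss(model(image, noisy, t), gt_mask)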
+
+
+
+
+ + ☆ Catalog Phrase Grounding (CPG): Grounding of Product Textual Attributes + in Product Images for e-commerce Vision-Language Applications KDD 2022 + + +
+ We present Catalog Phrase Grounding (CPG), a model that can associate product +textual data (title, brands) into corresponding regions of product images +(isolated product region, brand logo region) for e-commerce vision-language +applications. We use a state-of-the-art modulated multimodal transformer +encoder-decoder architecture unifying object detection and phrase-grounding. We +train the model in self-supervised fashion with 2.3 million image-text pairs +synthesized from an e-commerce site. The self-supervision data is annotated +with high-confidence pseudo-labels generated with a combination of teacher +models: a pre-trained general domain phrase grounding model (e.g. MDETR) and a +specialized logo detection model. This allows CPG, as a student model, to +benefit from transfer knowledge from these base models combining general-domain +knowledge and specialized knowledge. Beyond immediate catalog phrase grounding +tasks, we can benefit from CPG representations by incorporating them as ML +features into downstream catalog applications that require deep semantic +understanding of products. Our experiments on product-brand matching, a +challenging e-commerce application, show that incorporating CPG representations +into the existing production ensemble system leads to on average 5% recall +improvement across all countries globally (with the largest lift of 11% in a +single country) at fixed 95% precision, outperforming other alternatives +including a logo detection teacher model and ResNet50. + +
+
+ comment: KDD 2022 Workshop on First Content Understanding and Generation for + e-Commerce +
+
+
+
+
+ + ☆ Two-Stage Violence Detection Using ViTPose and Classification Models at + Smart Airports + + +
+ This study introduces an innovative violence detection framework tailored to the unique requirements of smart airports, where prompt responses to violent situations are crucial. The proposed framework harnesses the power of ViTPose for human pose estimation. It employs a CNN-BiLSTM network to analyse spatial and temporal information within keypoint sequences, enabling the accurate classification of violent behaviour in real time. Seamlessly integrated within the SAFE (Situational Awareness for Enhanced Security) framework of SAAB, the solution underwent integrated testing to ensure robust performance in real-world scenarios. The AIRTLab dataset, characterized by its high video quality and relevance to surveillance scenarios, is utilized in this study to enhance the model's accuracy and mitigate false positives. As airports face increased foot traffic in the post-pandemic era, implementing AI-driven violence detection systems, such as the one proposed, is paramount for improving security, expediting response times, and promoting data-informed decision making. The implementation of this framework not only diminishes the probability of violent events but also assists surveillance teams in effectively addressing potential threats, ultimately fostering a more secure and protected aviation sector. Codes are available at: https://github.com/Asami-1/GDP.
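+ The classification stage reduces to a sequence model over pose keypoints; a sketch assuming 17 COCO keypoints (34 coordinates) per frame, which may not match the exact feature layout used:
+
+ import torch
+ import torch.nn as nn
+
+ class PoseViolenceNet(nn.Module):
+     """Keypoint sequences (B, T, 34) -> violent / non-violent logits."""
+     def __init__(self, kp_dim=34, hidden=128):
+         super().__init__()
+         self.cnn = nn.Sequential(nn.Conv1d(kp_dim, 64, 3, padding=1), nn.ReLU())
+         self.lstm = nn.LSTM(64, hidden, batch_first=True, bidirectional=True)
+         self.head = nn.Linear(2 * hidden, 2)
+
+     def forward(self, x):                                 # x: (B, T, kp_dim)
+         f = self.cnn(x.transpose(1, 2)).transpose(1, 2)   # (B, T, 64)
+         h, _ = self.lstm(f)
+         return self.head(h[:, -1])                        # use final timestep
+
+ print(PoseViolenceNet()(torch.randn(2, 30, 34)).shape)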
+
+
+
+
+ ☆ Cross-Platform Software for Blood Vessel Segmentation in Retinal Images (Software multiplataforma para a segmentação de vasos sanguíneos em imagens da retina)
+ In this work, we utilize image segmentation to visually identify blood vessels in retinal examination images. This process is typically carried out manually. However, we can employ heuristic methods and machine learning to automate or at least expedite the process. In this context, we propose a cross-platform, open-source, and responsive software that allows users to manually segment a retinal image. The purpose is to use the user-segmented image to retrain machine learning algorithms, thereby enhancing future automated segmentation results. Moreover, the software also incorporates and applies certain image filters established in the literature to improve vessel visualization. To the best of our knowledge, this is the first integrated software in the literature that embodies all of the aforementioned attributes: open-source, responsive, and cross-platform. It offers a comprehensive solution encompassing manual vessel segmentation, as well as the automated execution of classification algorithms to refine predictive models.
+
+ comment: in Portuguese language. International Conference on Production + Research - Americas 2022. + https://www.even3.com.br/anais/foreigners_subscription_icpr_americas22/664603-software-multiplataforma-para-a-segmentacao-de-vasos-sanguineos-em-imagens-da-retina/ +
+
+
+
+
+ + ☆ Ten Years of Generative Adversarial Nets (GANs): A survey of the + state-of-the-art + + +
+ Since their inception in 2014, Generative Adversarial Networks (GANs) have rapidly emerged as powerful tools for generating realistic and diverse data across various domains, including computer vision and other applied areas. Consisting of a discriminative network and a generative network engaged in a minimax game, GANs have revolutionized the field of generative modeling. In February 2018, GANs were included in the ``10 Breakthrough Technologies'' list issued by the MIT Technology Review. Over the years, numerous advancements have been proposed, leading to a rich array of GAN variants, such as conditional GAN, Wasserstein GAN, CycleGAN, and StyleGAN, among many others. This survey aims to provide a general overview of GANs, summarizing the underlying architecture, validation metrics, and application areas of the most widely recognized variants. We also delve into recent theoretical developments, exploring the profound connection between the adversarial principle underlying GANs and the Jensen-Shannon divergence, while discussing the optimality characteristics of the GAN framework. The efficiency of GAN variants and their model architectures will be evaluated along with training obstacles as well as training solutions. In addition, a detailed discussion will be provided examining the integration of GANs with newly developed deep learning frameworks such as Transformers, Physics-Informed Neural Networks, Large Language Models, and Diffusion models. Finally, we outline several open issues as well as future research directions in this field.
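+ For reference, the minimax game mentioned above is the original GAN objective,
+
+ \min_G \max_D V(D, G) =
+     \mathbb{E}_{x \sim p_{\mathrm{data}}(x)}[\log D(x)]
+     + \mathbb{E}_{z \sim p_z(z)}[\log(1 - D(G(z)))],
+
+ and at the optimal discriminator it reduces to $2\,\mathrm{JSD}(p_{\mathrm{data}} \,\|\, p_g) - \log 4$, which is exactly the Jensen-Shannon connection the survey revisits.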
+
+
+
+
+ + ☆ A reinforcement learning based construction material supply strategy + using robotic crane and computer vision for building reconstruction after an + earthquake + + +
+ After an earthquake, it is particularly important to provide the necessary resources on site because a large number of infrastructures need to be repaired or newly constructed. Due to the complex construction environment after the disaster, there are potential safety hazards for human laborers working in this environment. With the advancement of robotic technology and artificial intelligence (AI) algorithms, smart robotic technology is a potential solution for providing construction resources after an earthquake. In this paper, a robotic crane with advanced AI algorithms is proposed to provide resources for infrastructure reconstruction after an earthquake. Proximal policy optimization (PPO), a reinforcement learning (RL) algorithm, is implemented for 3D lift path planning when transporting construction materials. The state and reward function are designed in detail for RL model training. Two models are trained through a loading task in different environments using the PPO algorithm, one considering the influence of obstacles and the other not. The two trained models are then compared and evaluated through an unloading task and a loading task in simulation environments. For each task, two different cases are considered: one with no obstacle between the initial position where the construction material is lifted and the target position, and the other with obstacles between the initial and target positions. The results show that the model that considers obstacles during training can generate proper actions for the robotic crane to execute, so that the crane can automatically transport the construction materials to the desired location with swing suppression, short time consumption, and collision avoidance.
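+ A sketch of what such a shaped reward could look like (the weights and terms below are illustrative assumptions; the paper designs its state and reward in detail):
+
+ def crane_reward(dist_to_target, swing_angle, collided, step_cost=0.01):
+     """Approach the target, damp load swing, avoid collisions, finish fast."""
+     r = -1.0 * dist_to_target - 0.5 * abs(swing_angle) - step_cost
+     if collided:
+         r -= 10.0                # strong penalty for hitting an obstacle
+     if dist_to_target < 0.05:
+         r += 10.0                # bonus for reaching the target position
+     return r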
+
+ comment: 12 pages, 7 figures, accepted in the Canadian Conference - Pacific + Conference on Earthquake Engineering 2023, Vancouver, British Columbia +
+
+
+
+
+ + ☆ Autonomous damage assessment of structural columns using low-cost micro + aerial vehicles and multi-view computer vision + + +
+ Structural columns are the crucial load-carrying components of buildings and +bridges. Early detection of column damage is important for the assessment of +the residual performance and the prevention of system-level collapse. This +research proposes an innovative end-to-end micro aerial vehicles (MAVs)-based +approach to automatically scan and inspect columns. First, an MAV-based +automatic image collection method is proposed. The MAV is programmed to sense +the structural columns and their surrounding environment. During the +navigation, the MAV first detects and approaches the structural columns. Then, +it starts to collect image data at multiple viewpoints around every detected +column. Second, the collected images will be used to assess the damage types +and damage locations. Third, the damage state of the structural column will be +determined by fusing the evaluation outcomes from multiple camera views. In +this study, reinforced concrete (RC) columns are selected to demonstrate the +effectiveness of the approach. Experimental results indicate that the proposed +MAV-based inspection approach can effectively collect images from multiple +viewing angles, and accurately assess critical RC column damages. The approach +improves the level of autonomy during the inspection. In addition, the +evaluation outcomes are more comprehensive than the existing 2D vision methods. +The concept of the proposed inspection approach can be extended to other +structural columns such as bridge piers. + +
+
+ comment: 12 pages, 11 figures, accepted in the Canadian Conference - Pacific + Conference on Earthquake Engineering 2023, Vancouver, British Columbia +
+
+
+
+
+ + ☆ Learning Diverse Features in Vision Transformers for Improved + Generalization ICML + + +
+ Deep learning models often rely only on a small set of features even when +there is a rich set of predictive signals in the training data. This makes +models brittle and sensitive to distribution shifts. In this work, we first +examine vision transformers (ViTs) and find that they tend to extract robust +and spurious features with distinct attention heads. As a result of this +modularity, their performance under distribution shifts can be significantly +improved at test time by pruning heads corresponding to spurious features, +which we demonstrate using an "oracle selection" on validation data. Second, we +propose a method to further enhance the diversity and complementarity of the +learned features by encouraging orthogonality of the attention heads' input +gradients. We observe improved out-of-distribution performance on diagnostic +benchmarks (MNIST-CIFAR, Waterbirds) as a consequence of the enhanced diversity +of features and the pruning of undesirable heads. + +
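+ The second contribution can be sketched as a penalty on pairwise cosine similarity between per-head input gradients (how those gradients are attributed to individual heads is model-specific and omitted here):
+
+ import torch
+ import torch.nn.functional as F
+
+ def head_grad_diversity_penalty(head_grads):
+     """head_grads: list of input-gradient tensors, one per attention head."""
+     g = torch.stack([h.flatten() for h in head_grads])
+     g = F.normalize(g, dim=1)
+     sim = g @ g.t()                     # pairwise cosine similarities
+     n = g.shape[0]
+     off = sim - torch.eye(n)            # ignore self-similarity
+     return (off ** 2).sum() / (n * (n - 1))
+
+ print(head_grad_diversity_penalty([torch.randn(3, 224, 224) for _ in range(4)]))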
+
+ comment: 2023 ICML Workshop on Spurious Correlations, Invariance and Stability +
+
+
+
+
+ + ☆ Emergence of Segmentation with Minimalistic White-Box Transformers + + +
+ Transformer-like models for vision tasks have recently proven effective for a +wide range of downstream applications such as segmentation and detection. +Previous works have shown that segmentation properties emerge in vision +transformers (ViTs) trained using self-supervised methods such as DINO, but not +in those trained on supervised classification tasks. In this study, we probe +whether segmentation emerges in transformer-based models solely as a result of +intricate self-supervised learning mechanisms, or if the same emergence can be +achieved under much broader conditions through proper design of the model +architecture. Through extensive experimental results, we demonstrate that when +employing a white-box transformer-like architecture known as CRATE, whose +design explicitly models and pursues low-dimensional structures in the data +distribution, segmentation properties, at both the whole and parts levels, +already emerge with a minimalistic supervised training recipe. Layer-wise +finer-grained analysis reveals that the emergent properties strongly +corroborate the designed mathematical functions of the white-box network. Our +results suggest a path to design white-box foundation models that are +simultaneously highly performant and mathematically fully interpretable. Code +is at \url{https://github.com/Ma-Lab-Berkeley/CRATE}. + +
+
+ comment: Code: https://github.com/Ma-Lab-Berkeley/CRATE +
+
+
+
+
+ + ☆ Can Prompt Learning Benefit Radiology Report Generation? + + +
+ Radiology report generation aims to automatically provide clinically meaningful descriptions of radiology images such as MRI and X-ray. Although great success has been achieved in natural scene image captioning tasks, radiology report generation remains challenging and requires prior medical knowledge. In this paper, we propose PromptRRG, a method that utilizes prompt learning to activate a pretrained model and incorporate prior knowledge. Since prompt learning for radiology report generation has not been explored before, we begin by investigating prompt designs and categorise them based on varying levels of knowledge: common, domain-specific and disease-enriched prompts. Additionally, we propose an automatic prompt learning mechanism to alleviate the burden of manual prompt engineering. This is the first work to systematically examine the effectiveness of prompt learning for radiology report generation. Experimental results on the largest radiology report generation benchmark, MIMIC-CXR, demonstrate that our proposed method achieves state-of-the-art performance. Code will be available upon acceptance.
+
+ comment: 8 pages with 6 pages supplementary file +
+
+
+
+
+ + ☆ Robust Principles: Architectural Design Principles for Adversarially + Robust CNNs BMVC'23 + + +
+ Our research aims to unify existing works' diverging opinions on how +architectural components affect the adversarial robustness of CNNs. To +accomplish our goal, we synthesize a suite of three generalizable robust +architectural design principles: (a) optimal range for depth and width +configurations, (b) preferring convolutional over patchify stem stage, and (c) +robust residual block design through adopting squeeze and excitation blocks and +non-parametric smooth activation functions. Through extensive experiments +across a wide spectrum of dataset scales, adversarial training methods, model +parameters, and network design spaces, our principles consistently and markedly +improve AutoAttack accuracy: 1-3 percentage points (pp) on CIFAR-10 and +CIFAR-100, and 4-9 pp on ImageNet. The code is publicly available at +https://github.com/poloclub/robust-principles. + +
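+ Principle (c) is concrete enough to sketch: a residual block with a squeeze-and-excitation branch and a smooth activation (SiLU here, as one non-parametric smooth choice; channel sizes are illustrative):
+
+ import torch
+ import torch.nn as nn
+
+ class RobustResBlock(nn.Module):
+     def __init__(self, ch, r=16):
+         super().__init__()
+         self.conv = nn.Sequential(nn.Conv2d(ch, ch, 3, padding=1, bias=False),
+                                   nn.BatchNorm2d(ch), nn.SiLU(),
+                                   nn.Conv2d(ch, ch, 3, padding=1, bias=False),
+                                   nn.BatchNorm2d(ch))
+         self.se = nn.Sequential(nn.AdaptiveAvgPool2d(1),
+                                 nn.Conv2d(ch, ch // r, 1), nn.SiLU(),
+                                 nn.Conv2d(ch // r, ch, 1), nn.Sigmoid())
+         self.act = nn.SiLU()
+
+     def forward(self, x):
+         y = self.conv(x)
+         return self.act(x + y * self.se(y))   # SE-reweighted residual
+
+ print(RobustResBlock(64)(torch.randn(1, 64, 16, 16)).shape)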
+
+ comment: Published at BMVC'23 +
+
+
+
+
+ + ☆ Active Neural Mapping ICCV 2023 + + +
+ We address the problem of active mapping with a continually-learned neural +scene representation, namely Active Neural Mapping. The key lies in actively +finding the target space to be explored with efficient agent movement, thus +minimizing the map uncertainty on-the-fly within a previously unseen +environment. In this paper, we examine the weight space of the +continually-learned neural field, and show empirically that the neural +variability, the prediction robustness against random weight perturbation, can +be directly utilized to measure the instant uncertainty of the neural map. +Together with the continuous geometric information inherited in the neural map, +the agent can be guided to find a traversable path to gradually gain knowledge +of the environment. We present for the first time an active mapping system with +a coordinate-based implicit neural representation for online scene +reconstruction. Experiments in the visually-realistic Gibson and Matterport3D +environment demonstrate the efficacy of the proposed method. + +
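+ The neural-variability measure can be sketched generically: perturb the map's weights with small Gaussian noise and read off the spread of its predictions (the noise scale and sample count below are assumptions):
+
+ import copy
+ import torch
+
+ def neural_variability(model, x, sigma=0.01, n=8):
+     """Uncertainty proxy: std of outputs under random weight perturbation."""
+     outs = []
+     for _ in range(n):
+         m = copy.deepcopy(model)
+         with torch.no_grad():
+             for p in m.parameters():
+                 p.add_(sigma * torch.randn_like(p))
+             outs.append(m(x))
+     return torch.stack(outs).std(dim=0).mean()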
+
+ comment: ICCV 2023, project page: + https://zikeyan.github.io/active-INR/index.html +
+
+
+
+
+ + ☆ Deep Video Codec Control + + +
+ Lossy video compression is commonly used when transmitting and storing video data. Unified video codecs (e.g., H.264 or H.265) remain the \emph{de facto} standard, despite the availability of advanced (neural) compression approaches. Transmitting videos in the face of dynamic network bandwidth conditions requires video codecs to adapt to vastly different compression strengths. Rate control modules augment the codec's compression such that bandwidth constraints are satisfied and video distortion is minimized. While both standard video codecs and their rate control modules are developed to minimize video distortion w.r.t. human quality assessment, preserving the downstream performance of deep vision models is not considered. In this paper, we present the first end-to-end learnable deep video codec control that considers both bandwidth constraints and downstream vision performance, while not breaking existing standardization. We demonstrate for two common vision tasks (semantic segmentation and optical flow estimation) and on two different datasets that our deep codec control better preserves downstream performance than using 2-pass average bit rate control, while meeting dynamic bandwidth constraints and adhering to standardizations.
+
+ comment: 22 pages, 26 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Going Beyond Nouns With Vision & Language Models Using Synthetic Data ICCV 2023 + + +
+ Large-scale pre-trained Vision & Language (VL) models have shown remarkable performance in many applications, enabling replacing a fixed set of supported classes with zero-shot open vocabulary reasoning over (almost arbitrary) natural language prompts. However, recent works have uncovered a fundamental weakness of these models: for example, they have difficulty understanding Visual Language Concepts (VLC) that go 'beyond nouns', such as the meaning of non-object words (e.g., attributes, actions, relations, states, etc.), and difficulty performing compositional reasoning, such as understanding the significance of the order of the words in a sentence. In this work, we investigate to what extent purely synthetic data can be leveraged to teach these models to overcome such shortcomings without compromising their zero-shot capabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale synthetic dataset and data generation codebase allowing generation of additional suitable data to improve VLC understanding and compositional reasoning of VL models. Additionally, we propose a general VL finetuning strategy for effectively leveraging SyViC towards achieving these improvements. Our extensive experiments and ablations on the VL-Checklist, Winoground, and ARO benchmarks demonstrate that it is possible to adapt strong pre-trained VL models with synthetic data, significantly enhancing their VLC understanding (e.g. by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their zero-shot accuracy.
+
+ comment: Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/ +
+
+
+
+
+ + ♻ ☆ CartiMorph: a framework for automated knee articular cartilage + morphometrics + + +
+ We introduce CartiMorph, a framework for automated knee articular cartilage +morphometrics. It takes an image as input and generates quantitative metrics +for cartilage subregions, including the percentage of full-thickness cartilage +loss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the +power of deep learning models for hierarchical image feature representation. +Deep learning models were trained and validated for tissue segmentation, +template construction, and template-to-image registration. We established +methods for surface-normal-based cartilage thickness mapping, FCL estimation, +and rule-based cartilage parcellation. Our cartilage thickness map showed less +error in thin and peripheral regions. We evaluated the effectiveness of the +adopted segmentation model by comparing the quantitative metrics obtained from +model segmentation and those from manual segmentation. The root-mean-squared +deviation of the FCL measurements was less than 8%, and strong correlations +were observed for the mean thickness (Pearson's correlation coefficient $\rho +\in [0.82,0.97]$), surface area ($\rho \in [0.82,0.98]$) and volume ($\rho \in +[0.89,0.98]$) measurements. We compared our FCL measurements with those from a +previous study and found that our measurements deviated less from the ground +truths. We observed superior performance of the proposed rule-based cartilage +parcellation method compared with the atlas-based approach. CartiMorph has the +potential to promote imaging biomarkers discovery for knee osteoarthritis. + +
+
+ comment: To be published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Uncertainty-Aware Source-Free Adaptive Image Super-Resolution with + Wavelet Augmentation Transformer + + +
+ Unsupervised Domain Adaptation (UDA) can effectively address domain gap +issues in real-world image Super-Resolution (SR) by accessing both the source +and target data. Considering privacy policies or transmission restrictions of +source data in practical scenarios, we propose a SOurce-free Domain Adaptation +framework for image SR (SODA-SR) to address this issue, i.e., adapt a +source-trained model to a target domain with only unlabeled target data. +SODA-SR leverages the source-trained model to generate refined pseudo-labels +for teacher-student learning. To better utilize pseudo-labels, we propose a +novel wavelet-based augmentation method, named Wavelet Augmentation Transformer +(WAT), which can be flexibly incorporated with existing networks, to implicitly +produce useful augmented data. WAT learns low-frequency information of varying +levels across diverse samples, which is aggregated efficiently via deformable +attention. Furthermore, an uncertainty-aware self-training mechanism is +proposed to improve the accuracy of pseudo-labels, with inaccurate predictions +being rectified by uncertainty estimation. To acquire better SR results and +avoid overfitting pseudo-labels, several regularization losses are proposed to +constrain target LR and SR images in the frequency domain. Experiments show +that without accessing source data, SODA-SR outperforms state-of-the-art UDA +methods in both synthetic$\rightarrow$real and real$\rightarrow$real adaptation +settings, and is not constrained by specific network architectures. + +
+
+ comment: 9 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Conditioning Diffusion Models via Attributes and Semantic Masks for Face + Generation + + +
+ Deep generative models have shown impressive results in generating realistic +images of faces. GANs managed to generate high-quality, high-fidelity images +when conditioned on semantic masks, but they still lack the ability to +diversify their output. Diffusion models partially solve this problem and are +able to generate diverse samples given the same condition. In this paper, we +propose a multi-conditioning approach for diffusion models via cross-attention +exploiting both attributes and semantic masks to generate high-quality and +controllable face images. We also studied the impact of applying +perceptual-focused loss weighting into the latent space instead of the pixel +space. Our method extends the previous approaches by introducing conditioning +on more than one set of features, guaranteeing a more fine-grained control over +the generated face images. We evaluate our approach on the CelebA-HQ dataset, +and we show that it can generate realistic and diverse samples while allowing +for fine-grained control over multiple attributes and semantic regions. +Additionally, we perform an ablation study to evaluate the impact of different +conditioning strategies on the quality and diversity of the generated images. + +
+
+
+
+
+ + ♻ ☆ What You Hear Is What You See: Audio Quality Metrics From Image Quality + Metrics + + +
+ In this study, we investigate the feasibility of utilizing state-of-the-art +image perceptual metrics for evaluating audio signals by representing them as +spectrograms. The encouraging outcome of the proposed approach is based on the +similarity between the neural mechanisms in the auditory and visual pathways. +Furthermore, we customise one of the metrics which has a psychoacoustically +plausible architecture to account for the peculiarities of sound signals. We +evaluate the effectiveness of our proposed metric and several baseline metrics +using a music dataset, with promising results in terms of the correlation +between the metrics and the perceived quality of audio as rated by human +evaluators. + +
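+ The core recipe is simple to sketch: render both signals as normalized mel spectrograms and score them with an off-the-shelf image metric such as SSIM (the study evaluates several perceptual metrics; SSIM is just a convenient stand-in):
+
+ import librosa
+ from skimage.metrics import structural_similarity
+
+ def audio_ssim(ref_wav, test_wav, sr=22050):
+     def spec(path):
+         y, _ = librosa.load(path, sr=sr)
+         s = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr))
+         return (s - s.min()) / (s.max() - s.min() + 1e-8)
+     a, b = spec(ref_wav), spec(test_wav)
+     t = min(a.shape[1], b.shape[1])       # align time axes
+     return structural_similarity(a[:, :t], b[:, :t], data_range=1.0)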
+
+
+
+
+ + ♻ ☆ Context-VQA: Towards Context-Aware and Purposeful Visual Question + Answering ICCV 2023 + + +
+ Visual question answering (VQA) has the potential to make the Internet more
+accessible in an interactive way, allowing people who cannot see images to ask
+questions about them. However, multiple studies have shown that people who are
+blind or have low vision prefer image explanations that incorporate the
+context in which an image appears, yet current VQA datasets focus on images in
+isolation. We argue that VQA models will not fully succeed at meeting people's
+needs unless they take context into account. To further motivate and analyze
+the distinction between different contexts, we introduce Context-VQA, a VQA
+dataset that pairs images with contexts, specifically types of websites (e.g.,
+a shopping website). We find that the types of questions vary systematically
+across contexts. For example, images presented in a travel context garner 2
+times more "Where?" questions, and images on social media and news garner 2.8
+and 1.8 times more "Who?" questions than the average. We also find that
+context effects are especially important when participants can't see the
+image. These results demonstrate that context affects the types of questions
+asked and that VQA models should be context-sensitive to better meet people's
+needs, especially in accessibility settings.
+
+
+
+ comment: Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision + and Language +
+
+
+
+
+ + ♻ ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023 + + +
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ NeXtQSM -- A complete deep learning pipeline for data-consistent + quantitative susceptibility mapping trained with hybrid data + + +
+ Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great
+potential in recent years, obtaining similar results to established
+non-learning approaches. Many current deep learning approaches are not data
+consistent, require in vivo training data, or solve the QSM problem in
+consecutive steps, resulting in the propagation of errors. Here we aim to
+overcome these limitations and develop a framework to solve the QSM processing
+steps jointly. We developed a new hybrid training data generation method that
+enables end-to-end training for solving background field correction and dipole
+inversion in a data-consistent fashion using a variational network that
+combines the QSM model term and a learned regularizer. We demonstrate that
+NeXtQSM overcomes the limitations of previous deep learning methods. NeXtQSM
+offers a new deep learning based pipeline for computing quantitative
+susceptibility maps that integrates each processing step into the training and
+provides results that are robust and fast.
+
+
+
+
+
+
+ + ♻ ☆ TAPIR: Tracking Any Point with per-frame Initialization and temporal + Refinement ICCV 2023 + + +
+ We present a novel model for Tracking Any Point (TAP) that effectively tracks +any queried point on any physical surface throughout a video sequence. Our +approach employs two stages: (1) a matching stage, which independently locates +a suitable candidate point match for the query point on every other frame, and +(2) a refinement stage, which updates both the trajectory and query features +based on local correlations. The resulting model surpasses all baseline methods +by a significant margin on the TAP-Vid benchmark, as demonstrated by an +approximate 20% absolute average Jaccard (AJ) improvement on DAVIS. Our model +facilitates fast inference on long and high-resolution video sequences. On a +modern GPU, our implementation has the capacity to track points faster than +real-time, and can be flexibly extended to higher-resolution videos. Given the +high-quality trajectories extracted from a large dataset, we demonstrate a +proof-of-concept diffusion model which generates trajectories from static +images, enabling plausible animations. Visualizations, source code, and +pretrained models can be found on our project webpage. + +
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ TriangleNet: Edge Prior Augmented Network for Semantic Segmentation + through Cross-Task Consistency + + +
+ This paper addresses the task of semantic segmentation in computer vision, +aiming to achieve precise pixel-wise classification. We investigate the joint +training of models for semantic edge detection and semantic segmentation, which +has shown promise. However, implicit cross-task consistency learning in +multi-task networks is limited. To address this, we propose a novel "decoupled +cross-task consistency loss" that explicitly enhances cross-task consistency. +Our semantic segmentation network, TriangleNet, achieves a substantial 2.88\% +improvement over the Baseline in mean Intersection over Union (mIoU) on the +Cityscapes test set. Notably, TriangleNet operates at 77.4\% mIoU/46.2 FPS on +Cityscapes, showcasing real-time inference capabilities at full resolution. +With multi-scale inference, performance is further enhanced to 77.8\%. +Furthermore, TriangleNet consistently outperforms the Baseline on the FloodNet +dataset, demonstrating its robust generalization capabilities. The proposed +method underscores the significance of multi-task learning and explicit +cross-task consistency enhancement for advancing semantic segmentation and +highlights the potential of multitasking in real-time semantic segmentation. + +
+
+ comment: Accepted for publication in the journal "International Journal of + Intelligent Systems" +
+
+
+
+
+ + ♻ ☆ DREAM: Efficient Dataset Distillation by Representative Matching + + +
+ Dataset distillation aims to synthesize small datasets with little
+information loss from original large-scale ones for reducing storage and
+training costs. Recent state-of-the-art methods mainly constrain the sample
+synthesis process by matching synthetic images and the original ones regarding
+gradients, embedding distributions, or training trajectories. Although there
+are various matching objectives, currently the strategy for selecting original
+images is limited to naive random sampling.
+ We argue that random sampling overlooks the evenness of the selected sample
+distribution, which may result in noisy or biased matching targets.
+ Besides, the sample diversity is also not constrained by random sampling.
+These factors together lead to optimization instability in the distilling
+process and degrade the training efficiency. Accordingly, we propose a novel
+matching strategy named \textbf{D}ataset distillation by
+\textbf{RE}present\textbf{A}tive \textbf{M}atching (DREAM), where only
+representative original images are selected for matching. DREAM can be easily
+plugged into popular dataset distillation frameworks and reduces the
+distilling iterations by more than 8 times without a performance drop. Given
+sufficient training time, DREAM further provides significant improvements and
+achieves state-of-the-art performance.
+
+
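+
+ The core selection idea can be sketched as follows, assuming per-class
+features from any pre-trained extractor: cluster each class and match against
+the samples nearest to the cluster centres rather than random draws. The
+cluster count and the feature extractor are placeholders, not DREAM's exact
+configuration.
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def select_representatives(features, n_clusters=16):
+    # features: (N, D) array for one class; returns indices of the samples
+    # closest to each k-means centre, giving even coverage of the class.
+    km = KMeans(n_clusters=n_clusters, n_init=10).fit(features)
+    idx = [int(np.argmin(np.linalg.norm(features - c, axis=1)))
+           for c in km.cluster_centers_]
+    return idx
+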
+
+ comment: Efficient matching for dataset distillation +
+
+
+
+
+ + ♻ ☆ LAC -- Latent Action Composition for Skeleton-based Action Segmentation ICCV 2023 + + +
+ Skeleton-based action segmentation requires recognizing composable actions in
+untrimmed videos. Current approaches decouple this problem by first extracting
+local visual features from skeleton sequences and then processing them by a
+temporal model to classify frame-wise actions. However, their performances
+remain limited as the visual features cannot sufficiently express composable
+actions. In this context, we propose Latent Action Composition (LAC), a novel
+self-supervised framework aiming at learning from synthesized composable
+motions for skeleton-based action segmentation. LAC is composed of a novel
+generation module for synthesizing new sequences. Specifically, we design a
+linear latent space in the generator to represent primitive motion. New
+composed motions can be synthesized by simply performing arithmetic operations
+on latent representations of multiple input skeleton sequences. LAC leverages
+such synthesized sequences, which have large diversity and complexity, for
+learning visual representations of skeletons in both sequence and frame spaces
+via contrastive learning. The resulting visual encoder has a high expressive
+power and can be effectively transferred onto action segmentation tasks by
+end-to-end fine-tuning without the need for additional temporal models. We
+conduct a study focusing on transfer learning, and we show that
+representations learned from pre-trained LAC outperform the state-of-the-art
+by a large margin on the TSU, Charades, and PKU-MMD datasets.
+
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Exploring the Benefits of Visual Prompting in Differential Privacy ICCV 2023 + + +
+ Visual Prompting (VP) is an emerging and powerful technique that allows +sample-efficient adaptation to downstream tasks by engineering a well-trained +frozen source model. In this work, we explore the benefits of VP in +constructing compelling neural network classifiers with differential privacy +(DP). We explore and integrate VP into canonical DP training methods and +demonstrate its simplicity and efficiency. In particular, we discover that VP +in tandem with PATE, a state-of-the-art DP training method that leverages the +knowledge transfer from an ensemble of teachers, achieves the state-of-the-art +privacy-utility trade-off with minimum expenditure of privacy budget. Moreover, +we conduct additional experiments on cross-domain image classification with a +sufficient domain gap to further unveil the advantage of VP in DP. Lastly, we +also conduct extensive ablation studies to validate the effectiveness and +contribution of VP under DP consideration. Our code is available at +(https://github.com/EzzzLi/Prompt-PATE). + +
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DeltaNN: Assessing the Impact of Computational Environment Parameters on + the Performance of Image Recognition Models + + +
+ Image recognition tasks typically use deep learning and require enormous +processing power, thus relying on hardware accelerators like GPUs and TPUs for +fast, timely processing. Failure in real-time image recognition tasks can occur +due to sub-optimal mapping on hardware accelerators during model deployment, +which may lead to timing uncertainty and erroneous behavior. Mapping on +hardware accelerators is done using multiple software components like deep +learning frameworks, compilers, and device libraries, that we refer to as the +computational environment. Owing to the increased use of image recognition +tasks in safety-critical applications like autonomous driving and medical +imaging, it is imperative to assess their robustness to changes in the +computational environment, as the impact of parameters like deep learning +frameworks, compiler optimizations, and hardware devices on model performance +and correctness is not yet well understood. + In this paper we present a differential testing framework, DeltaNN, that +allows us to assess the impact of different computational environment +parameters on the performance of image recognition models during deployment, +post training. DeltaNN generates different implementations of a given image +recognition model for variations in environment parameters, namely, deep +learning frameworks, compiler optimizations and hardware devices and analyzes +differences in model performance as a result. Using DeltaNN, we conduct an +empirical study of robustness analysis of three popular image recognition +models using the ImageNet dataset. We report the impact in terms of +misclassifications and inference time differences across different settings. In +total, we observed up to 72% output label differences across deep learning +frameworks, and up to 81% unexpected performance degradation in terms of +inference time, when applying compiler optimizations. + +
+
+ comment: 11 pages, 10 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Laughing Matters: Introducing Laughing-Face Generation using Diffusion + Models + + +
+ Speech-driven animation has gained significant traction in recent years, with
+current methods achieving near-photorealistic results. However, the field
+remains underexplored regarding non-verbal communication despite evidence
+demonstrating its importance in human interaction. In particular, generating
+laughter sequences presents a unique challenge due to the intricacy and
+nuances of this behaviour. This paper aims to bridge this gap by proposing a
+novel model capable of generating realistic laughter sequences, given a still
+portrait and an audio clip containing laughter. We highlight the failure cases
+of traditional facial animation methods and leverage recent advances in
+diffusion models to produce convincing laughter videos. We train our model on
+a diverse set of laughter datasets and introduce an evaluation metric
+specifically designed for laughter. When compared with previous speech-driven
+approaches, our model achieves state-of-the-art performance across all
+metrics, even when these are re-trained for laughter generation. Our code and
+project are publicly available.
+
+
+
+
+
+
+ + ♻ ☆ Discriminator-free Unsupervised Domain Adaptation for Multi-label Image + Classification + + +
+ In this paper, a discriminator-free adversarial-based Unsupervised Domain
+Adaptation (UDA) for Multi-Label Image Classification (MLIC), referred to as
+DDA-MLIC, is proposed. Recently, some attempts have been made to introduce
+adversarial-based UDA methods in the context of MLIC. However, these methods,
+which rely on an additional discriminator subnet, present one major
+shortcoming: the learning of domain-invariant features may harm their
+task-specific discriminative power, since the classification and
+discrimination tasks are decoupled. Herein, we propose to overcome this issue
+by introducing a novel adversarial critic that is directly deduced from the
+task-specific classifier. Specifically, a two-component Gaussian Mixture Model
+(GMM) is fitted on the source and target predictions in order to distinguish
+between two clusters. This allows extracting a Gaussian distribution for each
+component. The resulting Gaussian distributions are then used for formulating
+an adversarial loss based on a Fr\'echet distance. The proposed method is
+evaluated on several multi-label image datasets covering three different types
+of domain shift. The obtained results demonstrate that DDA-MLIC outperforms
+existing state-of-the-art methods in terms of precision while requiring a
+lower number of parameters. The code will be made publicly available online.
+
+
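+
+ A sketch of the critic construction, assuming classifier predictions pooled
+from source and target: the closed-form Fr\'echet (2-Wasserstein) distance
+between the two fitted Gaussian components supplies the adversarial signal.
+How this enters the training loss is simplified away here.
+
+import numpy as np
+from scipy.linalg import sqrtm
+from sklearn.mixture import GaussianMixture
+
+def gmm_frechet(predictions):
+    # predictions: (N, C) array of source + target classifier outputs.
+    gmm = GaussianMixture(n_components=2, covariance_type="full")
+    gmm.fit(predictions)
+    (m1, m2), (c1, c2) = gmm.means_, gmm.covariances_
+    covmean = sqrtm(c1 @ c2)
+    if np.iscomplexobj(covmean):  # drop tiny imaginary numerical noise
+        covmean = covmean.real
+    d = m1 - m2
+    return float(d @ d + np.trace(c1 + c2 - 2.0 * covmean))
+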
+
+
+
+
+ + ♻ ☆ Fault Localization for Buggy Deep Learning Framework Conversions in + Image Recognition + + +
+ When deploying Deep Neural Networks (DNNs), developers often convert models +from one deep learning framework to another (e.g., TensorFlow to PyTorch). +However, this process is error-prone and can impact target model accuracy. To +identify the extent of such impact, we perform and briefly present a +differential analysis against three DNNs widely used for image recognition +(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep +learning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which +revealed numerous model crashes and output label discrepancies of up to 72%. To +mitigate such errors, we present a novel approach towards fault localization +and repair of buggy deep learning framework conversions, focusing on +pre-trained image recognition models. Our technique consists of four stages of +analysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters, +and 4) graph representation. In addition, we propose various strategies towards +fault repair of the faults detected. We implement our technique on top of the +Apache TVM deep learning compiler, and we test it by conducting a preliminary +fault localization analysis for the conversion of InceptionV3 from TF to +TFLite. Our approach detected a fault in a common DNN converter tool, which +introduced precision errors in weights, reducing model accuracy. After our +fault localization, we repaired the issue, reducing our conversion error to +zero. + +
+
+ comment: 5 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Nonrigid Object Contact Estimation With Regional Unwrapping Transformer ICCV2023 + + +
+ Acquiring contact patterns between hands and nonrigid objects is a common +concern in the vision and robotics community. However, existing learning-based +methods focus more on contact with rigid ones from monocular images. When +adopting them for nonrigid contact, a major problem is that the existing +contact representation is restricted by the geometry of the object. +Consequently, contact neighborhoods are stored in an unordered manner and +contact features are difficult to align with image cues. At the core of our +approach lies a novel hand-object contact representation called RUPs (Region +Unwrapping Profiles), which unwrap the roughly estimated hand-object surfaces +as multiple high-resolution 2D regional profiles. The region grouping strategy +is consistent with the hand kinematic bone division because they are the +primitive initiators for a composite contact pattern. Based on this +representation, our Regional Unwrapping Transformer (RUFormer) learns the +correlation priors across regions from monocular inputs and predicts +corresponding contact and deformed transformations. Our experiments demonstrate +that the proposed framework can robustly estimate the deformed degrees and +deformed transformations, which makes it suitable for both nonrigid and rigid +contact. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ How Good is Google Bard's Visual Understanding? An Empirical Study on + Open Challenges + + +
+ Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in +the field of conversational AI. Notably, Bard has recently been updated to +handle visual inputs alongside text prompts during conversations. Given Bard's +impressive track record in handling textual inputs, we explore its capabilities +in understanding and interpreting visual data (images) conditioned by text +questions. This exploration holds the potential to unveil new insights and +challenges for Bard and other forthcoming multi-modal Generative models, +especially in addressing complex computer vision problems that demand accurate +visual and language understanding. Specifically, in this study, we focus on 15 +diverse task scenarios encompassing regular, camouflaged, medical, under-water +and remote sensing data to comprehensively evaluate Bard's performance. Our +primary finding indicates that Bard still struggles in these vision scenarios, +highlighting the significant gap in vision-based understanding that needs to be +bridged in future developments. We expect that this empirical study will prove +valuable in advancing future models, leading to enhanced capabilities in +comprehending and interpreting fine-grained visual data. Our project is +released on https://github.com/htqin/GoogleBard-VisUnderstand + +
+
+
+
+
+ + ♻ ☆ MB-TaylorFormer: Multi-branch Efficient Transformer Expanded by Taylor + Formula for Image Dehazing ICCV 2023 + + +
+ In recent years, Transformer networks have begun to replace pure
+convolutional neural networks (CNNs) in the field of computer vision due to
+their global receptive field and adaptability to input. However, the quadratic
+computational complexity of softmax-attention limits their wide application to
+the image dehazing task, especially for high-resolution images. To address
+this issue, we propose a new Transformer variant, which applies the Taylor
+expansion to approximate the softmax-attention and achieves linear
+computational complexity. A multi-scale attention refinement module is
+proposed as a complement to correct the error of the Taylor expansion.
+Furthermore, we introduce a multi-branch architecture with multi-scale patch
+embedding to the proposed Transformer, which embeds features by overlapping
+deformable convolution of different scales. The design of multi-scale patch
+embedding is based on three key ideas: 1) various sizes of the receptive
+field; 2) multi-level semantic information; 3) flexible shapes of the
+receptive field. Our model, named Multi-branch Transformer expanded by Taylor
+formula (MB-TaylorFormer), can embed coarse to fine features more flexibly at
+the patch embedding stage and capture long-distance pixel interactions with
+limited computational cost. Experimental results on several dehazing
+benchmarks show that MB-TaylorFormer achieves state-of-the-art (SOTA)
+performance with a light computational burden. The source code and pre-trained
+models are available at https://github.com/FVL2020/ICCV-2023-MB-TaylorFormer.
+
+
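+
+ The key trick can be sketched in a few lines: expanding exp(q.k) ~ 1 + q.k
+lets the attention be regrouped as Q(K^T V) instead of (Q K^T)V, which is
+linear in the token count. This toy version omits the paper's multi-scale
+refinement module that corrects the expansion error; shapes and normalisation
+are illustrative.
+
+import torch
+
+def taylor_attention(q, k, v, eps=1e-6):
+    # q, k: (B, N, D), v: (B, N, E); normalise q and k so q.k stays small
+    # and the first-order expansion of exp remains reasonable.
+    q = q / (q.norm(dim=-1, keepdim=True) + eps)
+    k = k / (k.norm(dim=-1, keepdim=True) + eps)
+    kv = torch.einsum("bnd,bne->bde", k, v)           # K^T V, linear in N
+    num = v.sum(dim=1, keepdim=True) + torch.einsum("bnd,bde->bne", q, kv)
+    den = k.shape[1] + torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + eps
+    return num / den.unsqueeze(-1)
+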
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SegViTv2: Exploring Efficient and Continual Semantic Segmentation with + Plain Vision Transformers + + +
+ This paper investigates the capability of plain Vision Transformers (ViTs)
+for semantic segmentation using the encoder-decoder framework and introduces
+\textbf{SegViTv2}. In this study, we introduce a novel Attention-to-Mask (ATM)
+module to design a lightweight decoder effective for plain ViT. The proposed
+ATM converts the global attention map into semantic masks for high-quality
+segmentation results. Our decoder outperforms the popular decoder UPerNet
+using various ViT backbones while consuming only about $5\%$ of the
+computational cost. For the encoder, we address the concern of the relatively
+high computational cost in the ViT-based encoders and propose a
+\emph{Shrunk++} structure that incorporates edge-aware query-based
+down-sampling (EQD) and query-based up-sampling (QU) modules. The Shrunk++
+structure reduces the computational cost of the encoder by up to $50\%$ while
+maintaining competitive performance. Furthermore, we propose to adapt SegViT
+for continual semantic segmentation, demonstrating nearly zero forgetting of
+previously learned knowledge. Experiments show that our proposed SegViTv2
+surpasses recent segmentation methods on three popular benchmarks including
+ADE20k, COCO-Stuff-10k and PASCAL-Context datasets. The code is available
+through the following link: \url{https://github.com/zbwxp/SegVit}.
+
+
+
+ comment: IJCV 2023 accepted, 21 pages, 8 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ BinaryViT: Towards Efficient and Accurate Binary Vision Transformers + + +
+ Vision Transformers (ViTs) have emerged as the fundamental architecture for
+most computer vision fields, but their considerable memory and computation
+costs hinder their application on resource-limited devices. As one of the most
+powerful compression methods, binarization reduces the computation of the
+neural network by quantizing the weights and activation values to $\pm$1.
+Although existing binarization methods have demonstrated excellent performance
+on Convolutional Neural Networks (CNNs), the full binarization of ViTs is
+still under-studied and suffers from a significant performance drop. In this
+paper, we first argue empirically that the severe performance degradation is
+mainly caused by the weight oscillation in the binarization training and the
+information distortion in the activation of ViTs. Based on these analyses, we
+propose $\textbf{BinaryViT}$, an accurate full binarization scheme for ViTs,
+which pushes the quantization of ViTs to the limit. Specifically, we propose a
+novel gradient regularization scheme (GRS) to drive a bimodal distribution of
+the weights and reduce oscillation in binarization training. Moreover, we
+design an activation shift module (ASM) to adaptively tune the activation
+distribution and reduce the information distortion caused by binarization.
+Extensive experiments on the ImageNet dataset show that our BinaryViT
+consistently surpasses the strong baseline by 2.05% and improves the accuracy
+of fully binarized ViTs to a usable level. Furthermore, our method achieves
+impressive savings of 16.2$\times$ and 17.7$\times$ in model size and OPs
+compared to the full-precision DeiT-S.
+
+
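+
+ For context, the standard binarization building block that schemes like this
+start from is a sign quantizer trained with a straight-through estimator
+(STE); the GRS and ASM components described above are BinaryViT-specific and
+are not reproduced in this generic sketch.
+
+import torch
+
+class BinarizeSTE(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, w):
+        ctx.save_for_backward(w)
+        return torch.sign(w)              # quantize weights to +/-1
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        (w,) = ctx.saved_tensors
+        # Straight-through: pass gradients where |w| <= 1, block elsewhere.
+        return grad_out * (w.abs() <= 1).float()
+
+binarize = BinarizeSTE.apply
+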
+
+
+
+
+ + ♻ ☆ Evaluating the Quality and Diversity of DCGAN-based Generatively + Synthesized Diabetic Retinopathy Imagery + + +
+ Publicly available diabetic retinopathy (DR) datasets are imbalanced,
+containing limited numbers of images with DR. This imbalance contributes to
+overfitting when training machine learning classifiers. The impact of this
+imbalance is exacerbated as the severity of the DR stage increases, affecting
+the classifiers' diagnostic capacity. The imbalance can be addressed using
+Generative Adversarial Networks (GANs) to augment the datasets with synthetic
+images. Generating synthetic images is advantageous if high-quality and
+diversified images are produced. To evaluate the quality and diversity of
+synthetic images, several evaluation metrics, such as Multi-Scale Structural
+Similarity Index (MS-SSIM), Cosine Distance (CD), and Fr\'echet Inception
+Distance (FID) are used. Understanding the effectiveness of each metric in
+evaluating the quality and diversity of GAN-based synthetic images is critical
+to select images for augmentation. To date, there has been limited analysis of
+the appropriateness of these metrics in the context of biomedical imagery.
+This work contributes an empirical assessment of these evaluation metrics as
+applied to synthetic Proliferative DR imagery generated by a Deep
+Convolutional GAN (DCGAN). Furthermore, we assess the metrics' capacity to
+indicate the quality and diversity of synthetic images and their correlation
+with classifier performance. This enables a quantitative selection of
+synthetic imagery and an informed augmentation strategy. Results indicate that
+FID is suitable for evaluating the quality, while MS-SSIM and CD are suitable
+for evaluating the diversity of synthetic imagery. Furthermore, the superior
+performance of Convolutional Neural Network (CNN) and EfficientNet
+classifiers, as indicated by the F1 and AUC scores, for the augmented datasets
+demonstrates the efficacy of synthetic imagery to augment the imbalanced
+dataset.
+
+
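+
+ As one concrete example of the diversity side, mean pairwise MS-SSIM over a
+synthetic set can be computed as below (lower mean similarity suggests more
+diversity); the sampling budget and the torchmetrics backend are illustrative
+choices, and images must be large enough for MS-SSIM's five scales.
+
+import torch
+from torchmetrics.image import MultiScaleStructuralSimilarityIndexMeasure
+
+def mean_pairwise_msssim(images, n_pairs=100):
+    # images: (N, C, H, W) tensor scaled to [0, 1], with H and W > 160.
+    metric = MultiScaleStructuralSimilarityIndexMeasure(data_range=1.0)
+    n = images.shape[0]
+    scores = []
+    for _ in range(n_pairs):
+        i, j = torch.randint(n, (2,)).tolist()
+        if i != j:
+            scores.append(metric(images[i:i + 1], images[j:j + 1]))
+    return torch.stack(scores).mean()
+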
+
+ comment: 29 Pages, 8 Figures, submitted to MEDAL23: Advances in Deep + Generative Models for Medical Artificial Intelligence (Springer Nature + series) +
+
+
+
+
+ + ♻ ☆ Food Classification using Joint Representation of Visual and Textual + Data + + +
+ Food classification is an important task in health care. In this work, we
+propose a multimodal classification framework that uses a modified version of
+EfficientNet with the Mish activation function for image classification, and
+the traditional BERT transformer-based network for text classification. The
+proposed network and the other state-of-the-art methods are evaluated on a
+large open-source dataset, UPMC Food-101. The experimental results show that
+the proposed network outperforms the other methods; significant differences of
+11.57% and 6.34% in accuracy are observed for image and text classification,
+respectively, when compared with the second-best performing method. We also
+compared the performance in terms of accuracy, precision, and recall for text
+classification using both machine learning and deep learning-based models. The
+comparative analysis from the prediction results of both images and text
+demonstrated the efficiency and robustness of the proposed approach.
+
+
+
+ comment: Updated results and discussions to be posted and some sections needed + to be expanded +
+
+
+
+
+ + ♻ ☆ NBV-SC: Next Best View Planning based on Shape Completion for Fruit + Mapping and Reconstruction + + +
+ Active perception for fruit mapping and harvesting is a difficult task since
+occlusions occur frequently and the location as well as size of fruits change
+over time. State-of-the-art viewpoint planning approaches utilize
+computationally expensive ray casting operations to find good viewpoints,
+aiming at maximizing information gain and covering the fruits in the scene. In
+this paper, we present a novel viewpoint planning approach that explicitly
+uses information about the predicted fruit shapes to compute targeted
+viewpoints that observe as yet unobserved parts of the fruits. Furthermore, we
+formulate the concept of viewpoint dissimilarity to reduce the sampling space
+for more efficient selection of useful, dissimilar viewpoints. Our simulation
+experiments with a UR5e arm equipped with an RGB-D sensor provide a
+quantitative demonstration of the efficacy of our iterative next best view
+planning method based on shape completion. In comparative experiments with a
+state-of-the-art viewpoint planner, we demonstrate improvement not only in the
+estimation of the fruit sizes, but also in their reconstruction, while
+significantly reducing the planning time. Finally, we show the viability of
+our approach for mapping sweet pepper plants with a real robotic system in a
+commercial glasshouse.
+
+
+
+ comment: Agricultural Automation, Viewpoint Planning, Active Perception, Shape + Completion +
+
+
+
+
+ + ♻ ☆ Three-stage binarization of color document images based on discrete + wavelet transform and generative adversarial networks + + +
+ The efficient segmentation of foreground text information from the background +in degraded color document images is a critical challenge in the preservation +of ancient manuscripts. The imperfect preservation of ancient manuscripts over +time has led to various types of degradation, such as staining, yellowing, and +ink seepage, significantly affecting image binarization results. This work +proposes a three-stage method using Generative Adversarial Networks (GAN) for +enhancing and binarizing degraded color document images through Discrete +Wavelet Transform (DWT). Stage-1 involves applying DWT and retaining the +Low-Low (LL) subband images for image enhancement. In Stage-2, the original +input image is divided into four single-channel images (Red, Green, Blue, and +Gray), and each is trained with independent adversarial networks to extract +color foreground information. In Stage-3, the output image from Stage-2 and the +original input image are used to train independent adversarial networks for +document binarization, enabling the integration of global and local features. +The experimental results demonstrate that our proposed method outperforms other +classic and state-of-the-art (SOTA) methods on the Document Image Binarization +Contest (DIBCO) datasets. We have released our implementation code at +https://github.com/abcpp12383/ThreeStageBinarization. + +
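+
+ Stage-1's wavelet step reduces to keeping the low-low (LL) subband of a 2-D
+DWT; a minimal sketch with PyWavelets follows, where the 'haar' wavelet is an
+illustrative choice rather than necessarily the paper's.
+
+import numpy as np
+import pywt
+
+def ll_subband(gray_image):
+    # dwt2 returns (LL, (LH, HL, HH)); LL holds the half-resolution
+    # low-frequency content used as input for enhancement.
+    LL, (LH, HL, HH) = pywt.dwt2(gray_image.astype(float), "haar")
+    return LL
+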
+
+
+
+
+ + ♻ ☆ Implicit neural representation for change detection + + +
+ Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained
+during two distinct time periods over the same geographic region, presents a
+significant challenge due to the disparities in spatial coverage and the
+presence of noise in the acquisition system. The most commonly used approaches
+to detecting changes in point clouds are based on supervised methods, which
+necessitate extensive labelled data that is often unavailable in real-world
+applications. To address these issues, we propose an unsupervised approach
+that comprises two components: Implicit Neural Representation (INR) for
+continuous shape reconstruction and a Gaussian Mixture Model for categorising
+changes. INR offers a grid-agnostic representation for encoding bi-temporal
+point clouds, with unmatched spatial support that can be regularised to
+enhance high-frequency details and reduce noise. The reconstructions at each
+timestamp are compared at arbitrary spatial scales, leading to a significant
+increase in detection capabilities. We apply our method to a benchmark dataset
+comprising simulated LiDAR point clouds for urban sprawl. This dataset
+encompasses diverse challenging scenarios, varying in resolutions, input
+modalities and noise levels. This enables a comprehensive multi-scenario
+evaluation, comparing our method with the current state-of-the-art approach.
+We outperform the previous methods by a margin of 10% in the intersection over
+union metric. In addition, we put our techniques to practical use by applying
+them in a real-world scenario to identify instances of illicit excavation of
+archaeological sites and validate our results by comparing them with findings
+from field experts.
+
+
+
+ comment: Main article is 10 pages + 6 pages of supplementary. Conference style + paper +
+
+
+
+
+ + ♻ ☆ What do neural networks learn in image classification? A frequency + shortcut perspective ICCV2023 + + +
+ Frequency analysis is useful for understanding the mechanisms of
+representation learning in neural networks (NNs). Most research in this area
+focuses on the learning dynamics of NNs for regression tasks, while little
+focuses on classification. This study empirically investigates the latter and
+expands the understanding of frequency shortcuts. First, we perform
+experiments on synthetic datasets, designed to have a bias in different
+frequency bands. Our results demonstrate that NNs tend to find simple
+solutions for classification, and what they learn first during training
+depends on the most distinctive frequency characteristics, which can be either
+low- or high-frequencies. Second, we confirm this phenomenon on natural
+images. We propose a metric to measure class-wise frequency characteristics
+and a method to identify frequency shortcuts. The results show that frequency
+shortcuts can be texture-based or shape-based, depending on what best
+simplifies the objective. Third, we validate the transferability of frequency
+shortcuts on out-of-distribution (OOD) test sets. Our results suggest that
+frequency shortcuts can be transferred across datasets and cannot be fully
+avoided by larger model capacity and data augmentation. We recommend that
+future research should focus on effective training schemes mitigating
+frequency shortcut learning.
+
+
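+
+ A simplified version of measuring class-wise frequency characteristics:
+average the centred magnitude spectra of one class and compare against the
+dataset-wide average to expose its distinctive bands. The paper's actual
+metric and shortcut-identification procedure are more involved than this
+sketch.
+
+import numpy as np
+
+def mean_spectrum(images):
+    # images: (N, H, W) grayscale array; average centred magnitude spectrum.
+    spec = np.abs(np.fft.fftshift(np.fft.fft2(images), axes=(-2, -1)))
+    return spec.mean(axis=0)
+
+def class_frequency_signature(class_imgs, all_imgs, eps=1e-8):
+    # Ratios > 1 mark frequency bands over-represented in this class.
+    return mean_spectrum(class_imgs) / (mean_spectrum(all_imgs) + eps)
+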
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Poincaré ResNet + + +
+ This paper introduces an end-to-end residual network that operates entirely +on the Poincar\'e ball model of hyperbolic space. Hyperbolic learning has +recently shown great potential for visual understanding, but is currently only +performed in the penultimate layer(s) of deep networks. All visual +representations are still learned through standard Euclidean networks. In this +paper we investigate how to learn hyperbolic representations of visual data +directly from the pixel-level. We propose Poincar\'e ResNet, a hyperbolic +counterpart of the celebrated residual network, starting from Poincar\'e 2D +convolutions up to Poincar\'e residual connections. We identify three +roadblocks for training convolutional networks entirely in hyperbolic space and +propose a solution for each: (i) Current hyperbolic network initializations +collapse to the origin, limiting their applicability in deeper networks. We +provide an identity-based initialization that preserves norms over many layers. +(ii) Residual networks rely heavily on batch normalization, which comes with +expensive Fr\'echet mean calculations in hyperbolic space. We introduce +Poincar\'e midpoint batch normalization as a faster and equally effective +alternative. (iii) Due to the many intermediate operations in Poincar\'e +layers, we lastly find that the computation graphs of deep learning libraries +blow up, limiting our ability to train on deep hyperbolic networks. We provide +manual backward derivations of core hyperbolic operations to maintain +manageable computation graphs. + +
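+
+ The basic primitive underlying Poincar\'e convolutions and residual
+connections is M\"obius addition, the hyperbolic analogue of vector addition
+on the unit ball (curvature -1 assumed here); numerical safeguards are
+simplified in this sketch.
+
+import torch
+
+def mobius_add(x, y, eps=1e-5):
+    # x, y: points inside the unit ball, last dimension = features.
+    xy = (x * y).sum(-1, keepdim=True)
+    xx = (x * x).sum(-1, keepdim=True)
+    yy = (y * y).sum(-1, keepdim=True)
+    num = (1 + 2 * xy + yy) * x + (1 - xx) * y
+    den = 1 + 2 * xy + xx * yy
+    return num / den.clamp_min(eps)
+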
+
+ comment: International Conference on Computer Vision 2023 +
+
+
+
+
+ + ♻ ☆ Dynamic Depth-Supervised NeRF for Multi-View RGB-D Operating Room Images + + +
+ The operating room (OR) is an environment of interest for the development of +sensing systems, enabling the detection of people, objects, and their semantic +relations. Due to frequent occlusions in the OR, these systems often rely on +input from multiple cameras. While increasing the number of cameras generally +increases algorithm performance, there are hard limitations to the number and +locations of cameras in the OR. Neural Radiance Fields (NeRF) can be used to +render synthetic views from arbitrary camera positions, virtually enlarging the +number of cameras in the dataset. In this work, we explore the use of NeRF for +view synthesis of dynamic scenes in the OR, and we show that regularisation +with depth supervision from RGB-D sensor data results in higher image quality. +We optimise a dynamic depth-supervised NeRF with up to six synchronised cameras +that capture the surgical field in five distinct phases before and during a +knee replacement surgery. We qualitatively inspect views rendered by a virtual +camera that moves 180 degrees around the surgical field at differing time +values. Quantitatively, we evaluate view synthesis from an unseen camera +position in terms of PSNR, SSIM and LPIPS for the colour channels and in MAE +and error percentage for the estimated depth. We find that NeRFs can be used to +generate geometrically consistent views, also from interpolated camera +positions and at interpolated time intervals. Views are generated from an +unseen camera pose with an average PSNR of 18.2 and a depth estimation error of +2.0%. Our results show the potential of a dynamic NeRF for view synthesis in +the OR and stress the relevance of depth supervision in a clinical setting. + +
+
+ comment: Accepted to the Workshop on Ambient Intelligence for HealthCare 2023 +
+
+
+
+
+ + ♻ ☆ NSF: Neural Surface Fields for Human Modeling from Monocular Depth ICCV 2023 + + +
+ Obtaining personalized 3D animatable avatars from a monocular camera has
+several real world applications in gaming, virtual try-on, animation, and
+VR/XR, etc. However, it is very challenging to model dynamic and fine-grained
+clothing deformations from such sparse data. Existing methods for modeling 3D
+humans from depth data have limitations in terms of computational efficiency,
+mesh coherency, and flexibility in resolution and topology. For instance,
+reconstructing shapes using implicit functions and extracting explicit meshes
+per frame is computationally expensive and cannot ensure coherent meshes
+across frames. Moreover, predicting per-vertex deformations on a pre-designed
+human template with a discrete surface lacks flexibility in resolution and
+topology. To overcome these limitations, we propose a novel method `NSF:
+Neural Surface Fields' for modeling 3D clothed humans from monocular depth.
+NSF defines a neural field solely on the base surface which models a
+continuous and flexible displacement field. NSF can be adapted to the base
+surface with different resolution and topology without retraining at inference
+time. Compared to existing approaches, our method eliminates the expensive
+per-frame surface extraction while maintaining mesh coherency, and is capable
+of reconstructing meshes with arbitrary resolution without retraining. To
+foster research in this direction, we release our code in the project page at:
+https://yuxuan-xue.com/nsf.
+
+
+
+ comment: Accepted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf
+
+
+
+
+
+ + ♻ ☆ Collaborative Perception in Autonomous Driving: Methods, Datasets and + Challenges + + +
+ Collaborative perception is essential to address occlusion and sensor
+failure issues in autonomous driving. In recent years, theoretical and
+experimental investigations of novel works for collaborative perception have
+increased tremendously. So far, however, few reviews have focused on
+systematic collaboration modules and large-scale collaborative perception
+datasets. This work reviews recent achievements in this field to bridge this
+gap and motivate future research. We start with a brief overview of
+collaboration schemes. After that, we systematically summarize the
+collaborative perception methods for ideal scenarios and real-world issues.
+The former focuses on collaboration modules and efficiency, and the latter is
+devoted to addressing the problems in actual application. Furthermore, we
+present large-scale public datasets and summarize quantitative results on
+these benchmarks. Finally, we highlight gaps and overlooked challenges between
+current academic research and real-world applications. The project page is
+https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving
+
+
+
+ comment: 18 pages, 6 figures. Accepted by IEEE Intelligent Transportation + Systems Magazine. URL: + https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving +
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then attribute the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. + +
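+
+ Epsilon Scaling itself is a one-line change inside the sampler: divide the
+predicted noise by a constant slightly above one before the update. The
+DDPM-style step below is a sketch; the schedule for lambda and the exact
+sampler configuration follow the paper, not this snippet.
+
+import torch
+
+@torch.no_grad()
+def sample_step(model, x_t, t, alpha_t, alpha_bar_t, sigma_t, lam=1.005):
+    # alpha_t, alpha_bar_t, sigma_t: scheduler scalars as tensors.
+    eps = model(x_t, t) / lam  # Epsilon Scaling: shrink the network output
+    mean = (x_t - (1 - alpha_t) / (1 - alpha_bar_t).sqrt() * eps) \
+           / alpha_t.sqrt()
+    return mean + sigma_t * torch.randn_like(x_t)
+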
+
+ comment: 7 pages, code available soon +
+
+
+
+
+ + ♻ ☆ Case-Aware Adversarial Training + + +
+ The neural network (NN) has become one of the most popular types of models
+in various signal processing applications. However, NNs are extremely
+vulnerable to adversarial examples (AEs). To defend against AEs, adversarial
+training (AT) is believed to be the most effective method, but its intensive
+computation limits its applicability in most applications. In this paper, to
+resolve the problem, we design a generic and efficient AT improvement scheme,
+namely case-aware adversarial training (CAT). Specifically, the intuition
+stems from the fact that a very limited number of informative samples can
+contribute most of the model performance. Alternatively, if only the most
+informative AEs are used in AT, we can lower the computational complexity of
+AT significantly while maintaining the defense effect. To achieve this, CAT
+achieves two breakthroughs. First, a method to estimate the information degree
+of adversarial examples is proposed for AE filtering. Second, to further
+enrich the information that the NN can obtain from AEs, CAT involves a
+sampling strategy based on weight estimation and class-level balancing to
+increase the diversity of AT at each iteration. Extensive experiments show
+that CAT is faster than vanilla AT by up to 3x while achieving a competitive
+defense effect.
+
+
+
+
+
+
+
+ ♻ ☆ Transfer Learning of Semantic Segmentation Methods for Identifying Buried
+ Archaeological Structures on LiDAR Data
+
+
+ When applying deep learning to remote sensing data in archaeological +research, a notable obstacle is the limited availability of suitable datasets +for training models. The application of transfer learning is frequently +employed to mitigate this drawback. However, there is still a need to explore +its effectiveness when applied across different archaeological datasets. This +paper compares the performance of various transfer learning configurations +using two semantic segmentation deep neural networks on two LiDAR datasets. The +experimental results indicate that transfer learning-based approaches in +archaeology can lead to performance improvements, although a systematic +enhancement has not yet been observed. We provide specific insights about the +validity of such techniques that can serve as a baseline for future works. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2023 (IGARSS 2023) @IEEE copyright +
+
+
+
+
+ + ♻ ☆ Is Complexity Required for Neural Network Pruning? A Case Study on + Global Magnitude Pruning + + +
+ Pruning neural networks has become popular in the last decade when it was
+shown that a large number of weights can be safely removed from modern neural
+networks without compromising accuracy. Numerous pruning methods have been
+proposed since then, each claiming to be better than the previous. Many
+state-of-the-art (SOTA) techniques today rely on complex pruning
+methodologies utilizing importance scores, getting feedback through
+back-propagation, or having heuristics-based pruning rules, amongst others. In
+this work, we question whether this pattern of introducing complexity is
+really necessary to achieve better pruning results. We benchmark these SOTA
+techniques against a naive pruning baseline, namely, Global Magnitude Pruning
+(Global MP). Global MP ranks weights in order of their magnitudes and prunes
+the smallest ones. Hence, in its vanilla form, it is one of the simplest
+pruning techniques. Surprisingly, we find that vanilla Global MP outperforms
+all the other SOTA techniques and achieves a new SOTA result. It also achieves
+promising performance on FLOPs sparsification, which we find is enhanced when
+pruning is conducted in a gradual fashion. We also find that Global MP is
+generalizable across tasks, datasets, and models with superior performance.
+Moreover, a common issue that many pruning algorithms run into at high
+sparsity rates, namely layer-collapse, can be easily fixed in Global MP by
+setting a minimum threshold of weights to be retained in each layer. Lastly,
+unlike many other SOTA techniques, Global MP does not require any additional
+algorithm-specific hyper-parameters and is very straightforward to tune and
+implement. We showcase our findings on various models (WRN-28-8, ResNet-32,
+ResNet-50, MobileNet-V1 and FastGRNN) and multiple datasets (CIFAR-10,
+ImageNet and HAR-2). Code is available at
+https://github.com/manasgupta-1/GlobalMP.
+
+
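+
+ Vanilla Global MP fits in a dozen lines, which is much of the paper's point;
+the sketch below also includes a minimum-retention guard against layer
+collapse. The sparsity value and guard fraction are illustrative.
+
+import torch
+
+def global_magnitude_prune(model, sparsity=0.9, min_keep_frac=0.01):
+    # Pool all multi-dim weights and find the global magnitude threshold.
+    all_w = torch.cat([p.detach().abs().flatten()
+                       for p in model.parameters() if p.dim() > 1])
+    k = max(1, int(sparsity * all_w.numel()))
+    thresh = all_w.kthvalue(k).values
+    for p in model.parameters():
+        if p.dim() <= 1:
+            continue  # leave biases / norm parameters dense
+        mask = p.detach().abs() >= thresh
+        # Layer-collapse guard: always retain the top fraction per layer.
+        keep = max(1, int(min_keep_frac * p.numel()))
+        top = p.detach().abs().flatten().topk(keep).indices
+        mask.view(-1)[top] = True
+        p.data.mul_(mask.to(p.dtype))
+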
+
+
+
+
+ + ♻ ☆ HHTrack: Hyperspectral Object Tracking Using Hybrid Attention + + +
+ Hyperspectral imagery provides abundant spectral information beyond the +visible RGB bands, offering rich discriminative details about objects in a +scene. Leveraging such data has the potential to enhance visual tracking +performance. In this paper, we propose a hyperspectral object tracker based on +hybrid attention (HHTrack). The core of HHTrack is a hyperspectral hybrid +attention (HHA) module that unifies feature extraction and fusion within one +component through token interactions. A hyperspectral bands fusion (HBF) module +is also introduced to selectively aggregate spatial and spectral signatures +from the full hyperspectral input. Extensive experiments demonstrate the +state-of-the-art performance of HHTrack on benchmark Near Infrared (NIR), Red +Near Infrared (Red-NIR), and Visible (VIS) hyperspectral tracking datasets. Our +work provides new insights into harnessing the strengths of transformers and +hyperspectral fusion to advance robust object tracking. + +
+
+
+
+
+ + ♻ ☆ Semi-supervised Semantic Segmentation with Mutual Knowledge Distillation + + +
+ Consistency regularization has been widely studied in recent semi-supervised
+semantic segmentation methods, and promising performance has been achieved. In
+this work, we propose a new consistency regularization framework, termed
+mutual knowledge distillation (MKD), combined with data and feature
+augmentation. We introduce two auxiliary mean-teacher models based on
+consistency regularization. More specifically, we use the pseudo-labels
+generated by a mean teacher to supervise the student network to achieve a
+mutual knowledge distillation between the two branches. In addition to using
+image-level strong and weak augmentation, we also discuss feature
+augmentation. This involves considering various sources of knowledge to
+distill the student network. Thus, we can significantly increase the diversity
+of the training samples. Experiments on public benchmarks show that our
+framework outperforms previous state-of-the-art (SOTA) methods under various
+semi-supervised settings. Code is available at semi-mmseg.
+
+
+
+
+
+
+ + ♻ ☆ Efficient Adaptive Ensembling for Image Classification + + +
+ In recent times, with the exception of sporadic cases, the trend in Computer
+Vision has been to achieve minor improvements in accuracy at the cost of
+considerable increases in complexity.
+ To reverse this trend, we propose a novel method to boost image
+classification performance without increasing complexity.
+ To this end, we revisited ensembling, a powerful approach that is often not
+used properly due to its more complex nature and long training time, so as to
+make it feasible through a specific design choice. First, we trained two
+EfficientNet-b0 end-to-end models (known to be the architecture with the best
+overall accuracy/complexity trade-off for image classification) on disjoint
+subsets of data (i.e. bagging). Then, we made an efficient adaptive ensemble
+by fine-tuning a trainable combination layer. In this way, we were able to
+outperform the state-of-the-art by an average of 0.5$\%$ in accuracy, with
+restrained complexity in terms of both the number of parameters (5-60 times
+fewer) and FLoating point Operations Per Second (FLOPS; 10-100 times fewer) on
+several major benchmark datasets.
+
+
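+
+ The design reduces to two frozen backbones plus one small trainable
+combination layer; a sketch follows, with backbone construction left as a
+placeholder (any pair of classifiers producing logits works).
+
+import torch
+import torch.nn as nn
+
+class AdaptiveEnsemble(nn.Module):
+    def __init__(self, model_a, model_b, n_classes):
+        super().__init__()
+        self.a, self.b = model_a, model_b
+        for p in list(self.a.parameters()) + list(self.b.parameters()):
+            p.requires_grad = False      # only the combiner is fine-tuned
+        self.combine = nn.Linear(2 * n_classes, n_classes)
+
+    def forward(self, x):
+        with torch.no_grad():
+            la, lb = self.a(x), self.b(x)
+        return self.combine(torch.cat([la, lb], dim=-1))
+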
+
+
+
+
+ + ♻ ☆ Scene Matters: Model-based Deep Video Compression + + +
+ Video compression has always been a popular research area, where many
+traditional and deep video compression methods have been proposed. These
+methods typically rely on signal prediction theory to enhance compression
+performance by designing highly efficient intra and inter prediction
+strategies and compressing video frames one by one. In this paper, we propose
+a novel model-based video compression (MVC) framework that regards scenes as
+the fundamental units for video sequences. Our proposed MVC directly models
+the intensity variation of the entire video sequence in one scene, seeking
+non-redundant representations instead of reducing redundancy through
+spatio-temporal predictions. To achieve this, we employ implicit neural
+representation as our basic modeling architecture. To improve the efficiency
+of video modeling, we first propose context-related spatial positional
+embedding and frequency domain supervision in spatial context enhancement. For
+temporal correlation capturing, we design the scene flow constraint mechanism
+and temporal contrastive loss. Extensive experimental results demonstrate that
+our method achieves up to a 20\% bitrate reduction compared to the latest
+video coding standard H.266 and is more efficient in decoding than existing
+video coding strategies.
+
+
+
+
+
+
+ + ♻ ☆ SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend + 3D Talking Faces ACM MM 2023 + + +
+ Speech-driven 3D face animation techniques have been extending their
+applications to various multimedia fields. Previous research has generated
+promising realistic lip movements and facial expressions from audio signals.
+However, traditional regression models solely driven by data face several
+essential problems, such as difficulties in accessing precise labels and
+domain gaps between different modalities, leading to unsatisfactory results
+lacking precision and coherence. To enhance the visual accuracy of generated
+lip movement while reducing the dependence on labeled data, we propose a novel
+framework SelfTalk, by involving self-supervision in a cross-modal network
+system to learn 3D talking faces. The framework constructs a network system
+consisting of three modules: facial animator, speech recognizer, and
+lip-reading interpreter. The core of SelfTalk is a commutative training
+diagram that facilitates compatible feature exchange among audio, text, and
+lip shape, enabling our models to learn the intricate connection between these
+factors. The proposed framework leverages the knowledge learned from the
+lip-reading interpreter to generate more plausible lip shapes. Extensive
+experiments and user studies demonstrate that our proposed approach achieves
+state-of-the-art performance both qualitatively and quantitatively. We
+recommend watching the supplementary video.
+
+
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Estimating 3D Dental Structures using Simulated Panoramic Radiographs + and Neural Ray Tracing + + +
+ Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality
+for dental examination. However, PX only provides a flattened 2D image,
+lacking a 3D view of the oral structure. In this paper, we propose a framework
+to estimate 3D oral structures from real-world PX. Our framework tackles full
+3D reconstruction for varying subjects (patients), where each reconstruction
+is based only on a single panoramic image. We create an intermediate
+representation called simulated PX (SimPX) from 3D Cone-beam computed
+tomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and
+rotational principles of PX imaging. SimPX aims not only to truthfully
+simulate PX, but also to facilitate the reverting process back to 3D data. We
+propose a novel neural model based on ray tracing which exploits both global
+and local input features to convert SimPX to 3D output. At inference, a real
+PX image is translated to a SimPX-style image with semantic regularization,
+and the translated image is processed by the generation module to produce
+high-quality outputs. Experiments show that our method outperforms the prior
+state-of-the-art in reconstruction tasks both quantitatively and
+qualitatively. Unlike prior methods, our method does not require any prior
+information such as the shape of dental arches, nor a matched PX-CBCT dataset
+for training, which is difficult to obtain in clinical practice.
+
+
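+
+ The Beer-Lambert rendering at the heart of SimPX reduces, for a straight-ray
+toy case, to integrating attenuation along each ray, I = I0 * exp(-sum mu dl);
+the paper's rotational PX trajectory is simplified to axis-aligned rays in
+this sketch.
+
+import numpy as np
+
+def beer_lambert_render(mu_volume, axis=0, dl=1.0, i0=1.0):
+    # mu_volume: 3-D array of attenuation coefficients; returns the 2-D
+    # projection along `axis` following I = I0 * exp(-integral of mu).
+    line_integral = mu_volume.sum(axis=axis) * dl
+    return i0 * np.exp(-line_integral)
+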
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Pre-trained transformer for adversarial purification + + +
+ With more and more deep neural networks being deployed for various daily
+services, their reliability is essential. Worryingly, deep neural networks are
+vulnerable and sensitive to adversarial attacks, the most common of which for
+such services are evasion-based. Recent works usually strengthen robustness by
+adversarial training or by leveraging the knowledge of a large amount of clean
+data. However, in practical terms, retraining and redeploying the model
+require a large computational budget, leading to heavy losses for the online
+service. In addition, when adversarial examples of a certain attack are
+detected, only limited adversarial examples are available to the service
+provider, while much clean data may not be accessible. Given these problems,
+we propose a new scenario, RaPiD (Rapid Plug-in Defender), which is to rapidly
+defend against a certain attack for the frozen original service model under
+the limitation of few clean and adversarial examples. Motivated by the
+generalization and universal computation ability of pre-trained transformer
+models, we come up with a new defender method, CeTaD, which stands for
+Considering Pre-trained Transformers as Defenders. In particular, we evaluate
+the effectiveness and the transferability of CeTaD in the case of one-shot
+adversarial examples and explore the impact of different parts of CeTaD as
+well as training data conditions. CeTaD is flexible, able to be embedded into
+an arbitrary differentiable model, and suitable for various types of attacks.
+
+
+
+
+
+
+ + ♻ ☆ Human Motion Diffusion as a Generative Prior + + +
+ Recent work has demonstrated the significant potential of denoising diffusion +models for generating human motion, including text-to-motion capabilities. +However, these methods are restricted by the paucity of annotated motion data, +a focus on single-person motions, and a lack of detailed control. In this +paper, we introduce three forms of composition based on diffusion priors: +sequential, parallel, and model composition. Using sequential composition, we +tackle the challenge of long sequence generation. We introduce DoubleTake, an +inference-time method with which we generate long animations consisting of +sequences of prompted intervals and their transitions, using a prior trained +only for short clips. Using parallel composition, we show promising steps +toward two-person generation. Beginning with two fixed priors as well as a few +two-person training examples, we learn a slim communication block, ComMDM, to +coordinate interaction between the two resulting motions. Lastly, using model +composition, we first train individual priors to complete motions that realize +a prescribed motion for a given joint. We then introduce DiffusionBlending, an +interpolation mechanism to effectively blend several such models to enable +flexible and efficient fine-grained joint and trajectory-level control and +editing. We evaluate the composition methods using an off-the-shelf motion +diffusion model, and further compare the results to dedicated models trained +for these specific tasks. + +
+
+
+
+
+ + ♻ ☆ FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance + Generation ICCV 2023 + + +
+ Generating full-body and multi-genre dance sequences from given music is a
+challenging task, due to the limitations of existing datasets and the inherent
+complexity of fine-grained hand motion and dance genres. To address these
+problems, we propose FineDance, which contains 14.6 hours of music-dance
+paired data with fine-grained hand motions, fine-grained genres (22 dance
+genres), and accurate posture. To the best of our knowledge, FineDance is the
+largest music-dance paired dataset with the most dance genres. Additionally,
+to address the monotonous and unnatural hand movements produced by previous
+methods, we propose a full-body dance generation network that utilizes the
+diverse generation capability of the diffusion model to alleviate monotony,
+and uses expert networks to address unrealistic motion. To further enhance the
+genre matching and long-term stability of generated dances, we propose a
+Genre&Coherent aware Retrieval Module. Besides, we propose a novel metric
+named Genre Matching Score to evaluate the genre-matching degree between dance
+and music. Quantitative and qualitative experiments demonstrate the quality of
+FineDance and the state-of-the-art performance of FineNet. The FineDance
+dataset and more qualitative samples can be found at our website.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ iWarpGAN: Disentangling Identity and Style to Generate Synthetic Iris + Images + + +
+ Generative Adversarial Networks (GANs) have shown success in approximating
+complex distributions for synthetic image generation. However, current
+GAN-based methods for generating biometric images, such as the iris, have
+certain limitations: (a) the synthetic images often closely resemble images in
+the training dataset; (b) the generated images lack diversity in terms of the
+number of unique identities represented; and (c) it is difficult to generate
+multiple images pertaining to the same identity. To overcome these issues, we
+propose iWarpGAN, which disentangles identity and style in the context of the
+iris modality by using two transformation pathways: an Identity Transformation
+Pathway to generate unique identities from the training set, and a Style
+Transformation Pathway to extract the style code from a reference image and
+output an iris image in this style. By concatenating the transformed identity
+code and reference style code, iWarpGAN generates iris images with both inter-
+and intra-class variations. The efficacy of the proposed method in generating
+such iris DeepFakes is evaluated both qualitatively and quantitatively using
+ISO/IEC 29794-6 Standard Quality Metrics and the VeriEye iris matcher.
+Further, the utility of the synthetically generated images is demonstrated by
+the improved performance of deep learning-based iris matchers trained on real
+data augmented with synthetic data.
+
+
+
+
+
+ + ♻ ☆ EvHandPose: Event-based 3D Hand Pose Estimation with Sparse Supervision + + +
+ Event cameras show great potential in 3D hand pose estimation, especially for
+addressing the challenges of fast motion and high dynamic range in a low-power
+way. However, due to the asynchronous differential imaging mechanism, it is
+challenging to design an event representation that encodes hand motion
+information, especially when the hands are not moving (causing motion
+ambiguity), and it is infeasible to fully annotate the temporally dense event
+stream. In this paper, we propose EvHandPose, with novel hand flow
+representations in an Event-to-Pose module, for accurate hand pose estimation
+that alleviates the motion ambiguity issue. To solve the problem under sparse
+annotation, we design contrast maximization and hand-edge constraints in a
+Pose-to-IWE (Image with Warped Events) module and formulate EvHandPose in a
+weakly-supervised framework. We further build EvRealHands, the first
+large-scale real-world event-based hand pose dataset covering several
+challenging scenes, to bridge the real-synthetic domain gap. Experiments on
+EvRealHands demonstrate that EvHandPose outperforms previous event-based
+methods under all evaluation scenes, achieves accurate and stable hand pose
+estimation with high temporal resolution in fast-motion and strong-light
+scenes compared with RGB-based methods, generalizes well to outdoor scenes and
+to another type of event camera, and shows potential for the hand gesture
+recognition task.
+
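+ For intuition, a toy sketch of the contrast-maximization objective over an
+image of warped events (IWE); the paper's module is learned, so this only
+illustrates the underlying principle, with assumed array layouts:
+
+import numpy as np
+
+def iwe_contrast(events: np.ndarray, flow, height=180, width=240) -> float:
+    """Warp events to t=0 along `flow`, accumulate, and score by variance.
+
+    events: (N, 3) array of (x, y, t); flow: (fx, fy) in pixels/second.
+    Higher variance (contrast) means the flow aligns event trajectories."""
+    x, y, t = events[:, 0], events[:, 1], events[:, 2]
+    xw = np.round(x - flow[0] * t).astype(int)
+    yw = np.round(y - flow[1] * t).astype(int)
+    keep = (xw >= 0) & (xw < width) & (yw >= 0) & (yw < height)
+    iwe = np.zeros((height, width))
+    np.add.at(iwe, (yw[keep], xw[keep]), 1.0)
+    return float(iwe.var())
+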
+
+
+
+
+ + ♻ ☆ WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant + Analysis ICCV 2023 + + +
+ Deep neural networks are susceptible to generating overconfident yet
+erroneous predictions when presented with data beyond known concepts. This
+challenge underscores the importance of detecting out-of-distribution (OOD)
+samples in the open world. In this work, we propose a novel feature-space OOD
+detection score based on class-specific and class-agnostic information.
+Specifically, the approach utilizes Whitened Linear Discriminant Analysis to
+project features into two subspaces - the discriminative and residual
+subspaces - in which the in-distribution (ID) classes are maximally separated
+and closely clustered, respectively. The OOD score is then determined by
+combining the deviations of the input data from the ID pattern in both
+subspaces. The efficacy of our method, named WDiscOOD, is verified on the
+large-scale ImageNet-1k benchmark, with six OOD datasets that cover a variety
+of distribution shifts. WDiscOOD demonstrates superior performance on deep
+classifiers with diverse backbone architectures, including CNNs and vision
+transformers. Furthermore, we show that WDiscOOD more effectively detects
+novel concepts in representation spaces trained with contrastive objectives,
+including supervised contrastive loss and multi-modality contrastive loss.
+
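+ A toy rendering of the two-subspace idea (our reading, not the paper's
+exact formulation): whiten the features, split the whitened space into a
+class-mean subspace and its residual, and combine deviations in both:
+
+import numpy as np
+
+def wdisc_ood_score(feats, labels, x, k):
+    """feats: (N, D) ID features; x: (D,) test feature; k: disc. dim."""
+    mu = feats.mean(0)
+    cov = np.cov(feats - mu, rowvar=False) + 1e-6 * np.eye(feats.shape[1])
+    # whiten so the overall scatter is (approximately) isotropic
+    w = np.linalg.inv(np.linalg.cholesky(cov))
+    fw, xw = (feats - mu) @ w.T, (x - mu) @ w.T
+    # discriminative subspace: top directions spanned by class means
+    means = np.stack([fw[labels == c].mean(0) for c in np.unique(labels)])
+    u, _, _ = np.linalg.svd(means.T, full_matrices=False)
+    basis = u[:, :k]
+    disc = xw @ basis
+    resid = xw - (xw @ basis) @ basis.T
+    # deviation in the discriminative subspace: distance to nearest mean
+    d_disc = min(np.linalg.norm(disc - m @ basis) for m in means)
+    return d_disc + np.linalg.norm(resid)   # larger => more OOD-like
+
+feats = np.random.randn(300, 16); labels = np.repeat(np.arange(3), 100)
+print(wdisc_ood_score(feats, labels, np.random.randn(16) * 4, k=2))
+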
+
+ comment: Accepted by ICCV 2023. Code is available at: + https://github.com/ivalab/WDiscOOD.git +
+
+
+
+
+ + ♻ ☆ Generalized Universal Domain Adaptation with Generative Flow Networks + + +
+ We introduce a new problem in unsupervised domain adaptation, termed
+Generalized Universal Domain Adaptation (GUDA), which aims to achieve precise
+prediction of all target labels, including unknown categories. GUDA bridges
+the gap between the label distribution shift-based and label space
+mismatch-based variants, essentially categorizing them as a unified problem
+and leading to a comprehensive framework for thoroughly solving all the
+variants. The key challenge of GUDA is developing and identifying novel target
+categories while estimating the target label distribution. To address this
+problem, we take advantage of the powerful exploration capability of
+generative flow networks and propose an active domain adaptation algorithm
+named GFlowDA, which selects diverse samples with probabilities proportional
+to a reward function. To enhance the exploration capability and effectively
+perceive the target label distribution, we tailor the states and rewards, and
+introduce an efficient solution for parent exploration and state transition.
+We also propose a training paradigm for GUDA called Generalized Universal
+Adversarial Network (GUAN), which involves collaborative optimization between
+GUAN and GFlowNet. Theoretical analysis highlights the importance of
+exploration, and extensive experiments on benchmark datasets demonstrate the
+superiority of GFlowDA.
+
+
+
+
+
+ + ♻ ☆ Universal Domain Adaptation via Compressive Attention Matching + + +
+ Universal domain adaptation (UniDA) aims to transfer knowledge from the
+source domain to the target domain without any prior knowledge about the label
+set. The challenge lies in determining whether the target samples belong to
+common categories. Mainstream methods make judgments based on sample features,
+which overemphasizes global information while ignoring the most crucial local
+objects in the image, resulting in limited accuracy. To address this issue, we
+propose a Universal Attention Matching (UniAM) framework that exploits the
+self-attention mechanism in vision transformers to capture crucial object
+information. The proposed framework introduces a novel Compressive Attention
+Matching (CAM) approach to explore the core information by compressively
+representing attentions. Furthermore, CAM incorporates a residual-based
+measurement to determine sample commonness. By utilizing this measurement,
+UniAM achieves domain-wise and category-wise Common Feature Alignment (CFA)
+and Target Class Separation (TCS). Notably, UniAM is the first method to
+utilize vision transformer attention directly for classification tasks.
+Extensive experiments show that UniAM outperforms current state-of-the-art
+methods on various benchmark datasets.
+
+
+
+
+
+ + ♻ ☆ Exploring the Relationship between Samples and Masks for Robust Defect + Localization + + +
+ Defect detection aims to detect and localize regions outside the normal
+distribution. Previous approaches model normality and compare it with the
+input to identify defective regions, potentially limiting their
+generalizability. This paper proposes a one-stage framework that detects
+defective patterns directly, without the modeling process. This ability is
+achieved through the joint efforts of three parties: a generative adversarial
+network (GAN), a newly proposed scaled pattern loss, and a dynamic masked
+cycle-consistent auxiliary network. Explicit information that could indicate
+the position of defects is intentionally excluded to avoid learning any direct
+mapping. Experimental results on the texture class of the challenging MVTec AD
+dataset show that the proposed method is 2.9\% higher than the SOTA methods in
+F1-Score, while substantially outperforming SOTA methods in generalizability.
+
+
+
+
+
+ + ♻ ☆ UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D + Representation for 3D Perception in Autonomous Driving + + +
+ Masked Autoencoders (MAE) play a pivotal role in learning potent
+representations, delivering outstanding results across various 3D perception
+tasks essential for autonomous driving. In real-world driving scenarios, it is
+commonplace to deploy multiple sensors for comprehensive environment
+perception. While integrating multi-modal features from these sensors can
+produce rich and powerful features, there is a noticeable gap in MAE methods
+addressing this integration. This research delves into multi-modal Masked
+Autoencoders tailored for a unified representation space in autonomous
+driving, aiming to pioneer a more efficient fusion of two distinct modalities.
+To intricately marry the semantics inherent in images with the geometric
+intricacies of LiDAR point clouds, we propose UniM$^2$AE. This model is a
+potent yet straightforward multi-modal self-supervised pre-training framework,
+mainly consisting of two designs. First, it projects the features from both
+modalities into a cohesive 3D volume space, expanded from the bird's eye view
+(BEV) to include the height dimension. This extension makes it possible to
+back-project the informative features, obtained by fusing features from both
+modalities, into their native modalities to reconstruct the multiple masked
+inputs. Second, the Multi-modal 3D Interactive Module (MMIM) is invoked to
+facilitate efficient inter-modal interaction. Extensive experiments conducted
+on the nuScenes dataset attest to the efficacy of UniM$^2$AE, indicating
+enhancements in 3D object detection and BEV map segmentation by 1.2\% (NDS)
+and 6.5\% (mIoU), respectively. Code is available at
+https://github.com/hollow-503/UniM2AE.
+
+
+ comment: Code available at https://github.com/hollow-503/UniM2AE +
+
+
+
+
+ + ♻ ☆ EA-LSS: Edge-aware Lift-splat-shot Framework for 3D BEV Object Detection + + +
+ In recent years, great progress has been made in Lift-Splat-Shot-based
+(LSS-based) 3D object detection. However, inaccurate depth estimation remains
+an important constraint on the accuracy of camera-only and multi-modal 3D
+object detection models, especially in regions where the depth changes
+significantly (i.e., the "depth jump" problem). In this paper, we propose a
+novel Edge-aware Lift-splat-shot (EA-LSS) framework. Specifically, an
+edge-aware depth fusion (EADF) module is proposed to alleviate the "depth
+jump" problem, and a fine-grained depth (FGD) module to further enforce
+refined supervision on depth. Our EA-LSS framework is compatible with any
+LSS-based 3D object detection model and effectively boosts their performance
+with a negligible increase in inference time. Experiments on the nuScenes
+benchmarks demonstrate that EA-LSS is effective for both camera-only and
+multi-modal models. It is worth mentioning that EA-LSS achieved
+state-of-the-art performance on the nuScenes test benchmark, with an mAP and
+NDS of 76.5% and 77.6%, respectively.
+
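+ To make the "depth jump" notion concrete, a small illustrative sketch
+(ours, with assumed names) that flags pixels where a depth map changes
+sharply -- the kind of region an edge-aware module would target:
+
+import numpy as np
+
+def depth_jump_mask(depth: np.ndarray, thresh: float = 1.0) -> np.ndarray:
+    """Mark pixels whose depth differs from a neighbor by > thresh meters."""
+    dy = np.abs(np.diff(depth, axis=0, prepend=depth[:1]))
+    dx = np.abs(np.diff(depth, axis=1, prepend=depth[:, :1]))
+    return np.maximum(dx, dy) > thresh
+
+depth = np.ones((4, 6)) * 5.0
+depth[:, 3:] = 20.0                        # a foreground/background boundary
+print(depth_jump_mask(depth).astype(int))  # the jump column is flagged
+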
+
+
+
+
+ + ♻ ☆ Few-Shot Object Detection via Synthetic Features with Optimal Transport + + +
+ Few-shot object detection aims to simultaneously localize and classify the +objects in an image with limited training samples. However, most existing +few-shot object detection methods focus on extracting the features of a few +samples of novel classes that lack diversity. Hence, they may not be sufficient +to capture the data distribution. To address that limitation, in this paper, we +propose a novel approach in which we train a generator to generate synthetic +data for novel classes. Still, directly training a generator on the novel class +is not effective due to the lack of novel data. To overcome that issue, we +leverage the large-scale dataset of base classes. Our overarching goal is to +train a generator that captures the data variations of the base dataset. We +then transform the captured variations into novel classes by generating +synthetic data with the trained generator. To encourage the generator to +capture data variations on base classes, we propose to train the generator with +an optimal transport loss that minimizes the optimal transport distance between +the distributions of real and synthetic data. Extensive experiments on two +benchmark datasets demonstrate that the proposed method outperforms the state +of the art. Source code will be available. + +
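+ As an illustration of such an objective, a generic entropic Sinkhorn sketch
+(our illustration, not necessarily the authors' exact loss) measuring the
+transport cost between batches of real and synthetic features:
+
+import torch
+
+def sinkhorn_ot_loss(real: torch.Tensor, fake: torch.Tensor,
+                     eps: float = 0.1, iters: int = 50) -> torch.Tensor:
+    """Entropic-regularized OT cost between two feature batches."""
+    cost = torch.cdist(real, fake) ** 2            # pairwise squared L2
+    k = torch.exp(-cost / eps)
+    u = torch.full((real.size(0),), 1.0 / real.size(0))
+    v = torch.full((fake.size(0),), 1.0 / fake.size(0))
+    a, b = torch.ones_like(u), torch.ones_like(v)
+    for _ in range(iters):                         # Sinkhorn iterations
+        a = u / (k @ b)
+        b = v / (k.t() @ a)
+    plan = a[:, None] * k * b[None, :]
+    return (plan * cost).sum()
+
+loss = sinkhorn_ot_loss(torch.randn(16, 64), torch.randn(16, 64))
+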
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can
+arise from extensive unsupervised training data. Such biases can produce
+suboptimal samples, skewed outcomes, and unfairness, with potentially serious
+consequences. Consequently, aligning these models with human ethics and
+preferences is an essential step toward ensuring their responsible and
+effective deployment in real-world applications. Prior research has primarily
+employed Reinforcement Learning from Human Feedback (RLHF) to address this
+problem, where generative models are fine-tuned with RL algorithms guided by a
+human-feedback-informed reward model. However, the inefficiencies and
+instabilities associated with RL algorithms frequently present substantial
+obstacles to successful alignment, necessitating the development of a more
+robust and streamlined approach. To this end, we introduce a new framework,
+Reward rAnked FineTuning (RAFT), designed to align generative models
+effectively. Utilizing a reward model and a sufficient number of samples, our
+approach selects the high-quality samples, discards those that exhibit
+undesired behavior, and subsequently enhances the model by fine-tuning on the
+filtered samples. Our studies show that RAFT can effectively improve model
+performance in both reward learning and other automated metrics, for both
+large language models and diffusion models.
+
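+ The core loop reduces to sample, rank, filter, fine-tune; a schematic
+sketch in which `generate`, `reward`, and `finetune` are assumed placeholder
+callables, not the released API:
+
+from typing import Callable, List
+
+def raft_round(prompts: List[str],
+               generate: Callable[[str, int], List[str]],
+               reward: Callable[[str, str], float],
+               finetune: Callable[[List[tuple]], None],
+               k: int = 8) -> None:
+    """One RAFT-style round: keep only the best-scoring sample per prompt."""
+    batch = []
+    for p in prompts:
+        candidates = generate(p, k)                 # k samples per prompt
+        best = max(candidates, key=lambda c: reward(p, c))
+        batch.append((p, best))                     # discard the rest
+    finetune(batch)                                 # supervised fine-tuning
+
+# toy usage with stand-in callables
+raft_round(["hello"],
+           generate=lambda p, k: [p + "!" * i for i in range(k)],
+           reward=lambda p, c: -abs(len(c) - 7),
+           finetune=lambda b: print("finetune on", b))
+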
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ CamP: Camera Preconditioning for Neural Radiance Fields SIGGRAPH + + +
+ Neural Radiance Fields (NeRF) can be optimized to obtain high-fidelity 3D +scene reconstructions of objects and large-scale scenes. However, NeRFs require +accurate camera parameters as input -- inaccurate camera parameters result in +blurry renderings. Extrinsic and intrinsic camera parameters are usually +estimated using Structure-from-Motion (SfM) methods as a pre-processing step to +NeRF, but these techniques rarely yield perfect estimates. Thus, prior works +have proposed jointly optimizing camera parameters alongside a NeRF, but these +methods are prone to local minima in challenging settings. In this work, we +analyze how different camera parameterizations affect this joint optimization +problem, and observe that standard parameterizations exhibit large differences +in magnitude with respect to small perturbations, which can lead to an +ill-conditioned optimization problem. We propose using a proxy problem to +compute a whitening transform that eliminates the correlation between camera +parameters and normalizes their effects, and we propose to use this transform +as a preconditioner for the camera parameters during joint optimization. Our +preconditioned camera optimization significantly improves reconstruction +quality on scenes from the Mip-NeRF 360 dataset: we reduce error rates (RMSE) +by 67% compared to state-of-the-art NeRF approaches that do not optimize for +cameras like Zip-NeRF, and by 29% relative to state-of-the-art joint +optimization approaches using the camera parameterization of SCNeRF. Our +approach is easy to implement, does not significantly increase runtime, can be +applied to a wide variety of camera parameterizations, and can +straightforwardly be incorporated into other NeRF-like models. + +
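+ A condensed sketch of the preconditioning idea (our simplification, not the
+paper's implementation): build a whitening transform from a proxy problem's
+Jacobian with respect to the camera parameters, and optimize in the
+whitened space:
+
+import numpy as np
+
+def whitening_preconditioner(jacobian: np.ndarray, damping: float = 1e-8):
+    """P such that the proxy problem's curvature J^T J becomes identity.
+
+    Optimizing whitened parameters q (with params = P @ q) removes scale
+    differences and correlations between camera parameters."""
+    jtj = jacobian.T @ jacobian + damping * np.eye(jacobian.shape[1])
+    vals, vecs = np.linalg.eigh(jtj)
+    return vecs @ np.diag(vals ** -0.5) @ vecs.T    # P = (J^T J)^(-1/2)
+
+# toy proxy: residuals of a projection model w.r.t. 6 camera parameters
+J = np.random.randn(100, 6)
+P = whitening_preconditioner(J)
+print(np.allclose(P @ (J.T @ J) @ P, np.eye(6), atol=1e-5))
+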
+
+ comment: SIGGRAPH Asia 2023, Project page: https://camp-nerf.github.io +
+
+
+
+
+ + ♻ ☆ Learning Temporal Distribution and Spatial Correlation for Universal + Moving Object Segmentation + + +
+ Universal moving object segmentation aims to provide a general model for
+videos from all types of natural scenes, as previous approaches are usually
+effective only for specific or similar scenes. In this paper, we propose a
+method called Learning Temporal Distribution and Spatial Correlation (LTS)
+that has the potential to be a general solution for universal moving object
+segmentation. In the proposed approach, the distribution of temporal pixels is
+first learned by our Defect Iterative Distribution Learning (DIDL) network for
+scene-independent segmentation. Then, the Stochastic Bayesian Refinement (SBR)
+network, which learns the spatial correlation, is proposed to improve the
+binary mask generated by the DIDL network. Benefiting from the scene
+independence of the temporal distribution and the accuracy improvement
+resulting from the spatial correlation, the proposed approach performs well
+for almost all videos from diverse and complex natural scenes with fixed
+parameters. Comprehensive experiments on standard datasets, including
+LASIESTA, CDNet2014, BMC, SBMI2015, and 128 real-world videos, demonstrate the
+superiority of the proposed approach compared to state-of-the-art methods,
+with or without the use of deep learning networks. To the best of our
+knowledge, this work has high potential to be a general solution for moving
+object segmentation in real-world environments.
+
+
+
+
+
+ + ♻ ☆ Beyond NeRF Underwater: Learning Neural Reflectance Fields for True + Color Correction of Marine Imagery + + +
+ Underwater imagery often exhibits distorted coloration as a result of +light-water interactions, which complicates the study of benthic environments +in marine biology and geography. In this research, we propose an algorithm to +restore the true color (albedo) in underwater imagery by jointly learning the +effects of the medium and neural scene representations. Our approach models +water effects as a combination of light attenuation with distance and +backscattered light. The proposed neural scene representation is based on a +neural reflectance field model, which learns albedos, normals, and volume +densities of the underwater environment. We introduce a logistic regression +model to separate water from the scene and apply distinct light physics during +training. Our method avoids the need to estimate complex backscatter effects in +water by employing several approximations, enhancing sampling efficiency and +numerical stability during training. The proposed technique integrates +underwater light effects into a volume rendering framework with end-to-end +differentiability. Experimental results on both synthetic and real-world data +demonstrate that our method effectively restores true color from underwater +imagery, outperforming existing approaches in terms of color consistency. + +
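+ For context, a commonly used simplified underwater image-formation model
+(a sketch with assumed coefficients, not the paper's full neural
+formulation) combines distance-dependent attenuation with backscatter and
+can be inverted for the true color:
+
+import numpy as np
+
+def underwater_forward(j, depth, beta_d, beta_b, b_inf):
+    """I = J * exp(-beta_d * d) + B_inf * (1 - exp(-beta_b * d))."""
+    return j * np.exp(-beta_d * depth) + b_inf * (1 - np.exp(-beta_b * depth))
+
+def restore_true_color(i, depth, beta_d, beta_b, b_inf):
+    """Invert the model above to recover J from an observed pixel I."""
+    backscatter = b_inf * (1 - np.exp(-beta_b * depth))
+    return (i - backscatter) * np.exp(beta_d * depth)
+
+j = np.array([0.6, 0.3, 0.2])                  # true color (RGB)
+bd, bb, binf = (np.array([0.4, 0.1, 0.05]), np.array([0.5, 0.2, 0.1]),
+                np.array([0.1, 0.3, 0.4]))
+i = underwater_forward(j, 4.0, bd, bb, binf)
+print(restore_true_color(i, 4.0, bd, bb, binf))  # recovers ~[0.6, 0.3, 0.2]
+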
+
+ comment: Robotics and Automation Letters (RA-L) VOL. 8, NO. 10, OCTOBER 2023 +
+
+
+
+
+ + ♻ ☆ Collecting The Puzzle Pieces: Disentangled Self-Driven Human Pose + Transfer by Permuting Textures ICCV 2023 + + +
+ Human pose transfer synthesizes new view(s) of a person for a given pose. +Recent work achieves this via self-reconstruction, which disentangles a +person's pose and texture information by breaking the person down into parts, +then recombines them for reconstruction. However, part-level disentanglement +preserves some pose information that can create unwanted artifacts. In this +paper, we propose Pose Transfer by Permuting Textures (PT$^2$), an approach for +self-driven human pose transfer that disentangles pose from texture at the +patch-level. Specifically, we remove pose from an input image by permuting +image patches so only texture information remains. Then we reconstruct the +input image by sampling from the permuted textures for patch-level +disentanglement. To reduce noise and recover clothing shape information from +the permuted patches, we employ encoders with multiple kernel sizes in a triple +branch network. On DeepFashion and Market-1501, PT$^2$ reports significant +gains on automatic metrics over other self-driven methods, and even outperforms +some fully-supervised methods. A user study also reports images generated by +our method are preferred in 68% of cases over self-driven approaches from prior +work. Code is available at https://github.com/NannanLi999/pt_square. + +
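+ The pose-removal step can be illustrated in a few lines (a toy sketch,
+independent of the paper's network): cut the image into patches and permute
+them, destroying spatial layout (pose) while texture statistics survive:
+
+import numpy as np
+
+def permute_patches(img: np.ndarray, patch: int, rng=None) -> np.ndarray:
+    """Shuffle non-overlapping patches of an (H, W, C) image."""
+    rng = rng or np.random.default_rng(0)
+    h, w, c = img.shape
+    gh, gw = h // patch, w // patch
+    tiles = (img[:gh * patch, :gw * patch]
+             .reshape(gh, patch, gw, patch, c)
+             .transpose(0, 2, 1, 3, 4)
+             .reshape(gh * gw, patch, patch, c))
+    tiles = tiles[rng.permutation(len(tiles))]
+    return (tiles.reshape(gh, gw, patch, patch, c)
+                 .transpose(0, 2, 1, 3, 4)
+                 .reshape(gh * patch, gw * patch, c))
+
+shuffled = permute_patches(np.random.rand(256, 192, 3), patch=32)
+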
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SurgT challenge: Benchmark of Soft-Tissue Trackers for Robotic Surgery + + +
+ This paper introduces the ``SurgT: Surgical Tracking'' challenge, which was
+organised in conjunction with MICCAI 2022. There were two purposes for the
+creation of this challenge: (1) the establishment of the first standardised
+benchmark for the research community to assess soft-tissue trackers; and (2)
+to encourage the development of unsupervised deep learning methods, given the
+lack of annotated data in surgery. A dataset of 157 stereo endoscopic videos
+from 20 clinical cases, along with stereo camera calibration parameters, was
+provided. Participants were assigned the task of developing algorithms to
+track the movement of soft tissues, represented by bounding boxes, in stereo
+endoscopic videos. At the end of the challenge, the developed methods were
+assessed on a previously hidden test subset. This assessment uses benchmarking
+metrics that were purposely developed for this challenge, to verify the
+efficacy of unsupervised deep learning algorithms in tracking soft tissue. The
+metric used for ranking the methods was the Expected Average Overlap (EAO)
+score, which measures the average overlap between a tracker's bounding boxes
+and the ground truth. First place in the challenge went to the deep learning
+submission by ICVS-2Ai, with a superior EAO score of 0.617. This method
+employs ARFlow to estimate unsupervised dense optical flow from cropped
+images, using photometric and regularization losses. Second, Jmees, with an
+EAO of 0.583, uses deep learning for surgical tool segmentation on top of a
+non-deep-learning baseline tracker, CSRT; CSRT by itself scores a similar EAO
+of 0.563. The results of this challenge show that, currently, non-deep
+learning methods are still competitive. The dataset and benchmarking tool
+created for this challenge have been made publicly available at
+https://surgt.grand-challenge.org/.
+
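+ As a reference point, average overlap builds on per-frame
+intersection-over-union; a compact sketch (the full EAO additionally
+averages over sequence lengths and tracker resets, which we omit here):
+
+import numpy as np
+
+def iou(a, b):
+    """IoU of two boxes in (x1, y1, x2, y2) format."""
+    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+    area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
+    return inter / (area(a) + area(b) - inter)
+
+def average_overlap(pred_boxes, gt_boxes) -> float:
+    return float(np.mean([iou(p, g) for p, g in zip(pred_boxes, gt_boxes)]))
+
+print(average_overlap([(0, 0, 10, 10), (5, 5, 15, 15)],
+                      [(0, 0, 10, 10), (0, 0, 10, 10)]))
+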
+
+
+
+
+ + ♻ ☆ Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models ICCV 2023 + + +
+ Despite tremendous progress in generating high-quality images using diffusion
+models, synthesizing a sequence of animated frames that are both
+photorealistic and temporally coherent is still in its infancy. While
+off-the-shelf billion-scale datasets for image generation are available,
+collecting similar video data of the same scale is still challenging. Also,
+training a video diffusion model is computationally much more expensive than
+its image counterpart. In this work, we explore finetuning a pretrained image
+diffusion model with video data as a practical solution for the video
+synthesis task. We find that naively extending the image noise prior to a
+video noise prior in video diffusion leads to sub-optimal performance. Our
+carefully designed video noise prior leads to substantially better
+performance. Extensive experimental validation shows that our model, Preserve
+Your Own Correlation (PYoCo), attains SOTA zero-shot text-to-video results on
+the UCF-101 and MSR-VTT benchmarks. It also achieves SOTA video generation
+quality on the small-scale UCF-101 benchmark with a $10\times$ smaller model,
+using significantly less computation than the prior art.
+
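+ To give the flavor of a temporally correlated noise prior (a sketch of the
+general idea; the paper's mixed and progressive formulations differ in
+detail), each frame's noise can share a common component so that frames are
+correlated yet remain unit-variance:
+
+import torch
+
+def correlated_video_noise(frames: int, shape, alpha: float = 1.0):
+    """Per-frame noise with a shared component across time.
+
+    eps_i = (alpha * eps_shared + eps_i') / sqrt(1 + alpha^2), so each
+    frame is still N(0, 1) but frames are correlated with each other."""
+    shared = torch.randn(1, *shape)
+    independent = torch.randn(frames, *shape)
+    return (alpha * shared + independent) / (1 + alpha ** 2) ** 0.5
+
+noise = correlated_video_noise(8, (3, 64, 64), alpha=1.0)
+print(noise.shape)  # torch.Size([8, 3, 64, 64])
+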
+
+ comment: ICCV 2023. Project webpage: + https://research.nvidia.com/labs/dir/pyoco +
+
+
+
+
+ + ♻ ☆ DR.CPO: Diversified and Realistic 3D Augmentation via Iterative + Construction, Random Placement, and HPR Occlusion + + +
+ In autonomous driving, data augmentation is commonly used to improve 3D
+object detection. The most basic methods include insertion of copied objects
+and rotation and scaling of the entire training frame. Numerous variants have
+been developed as well. The existing methods, however, are considerably
+limited when compared to the variety of real-world possibilities. In this
+work, we develop a diversified and realistic augmentation method that can
+flexibly construct a whole-body object, freely locate and rotate the object,
+and apply self-occlusion and external occlusion accordingly. To improve the
+diversity of whole-body object construction, we develop an iterative method
+that stochastically combines multiple objects observed in the real world into
+a single object. Unlike existing augmentation methods, the constructed objects
+can be randomly located and rotated in the training frame, because proper
+occlusions can be applied to the whole-body objects in the final step.
+Finally, proper self-occlusion at each local object level and external
+occlusion at the global frame level are applied using the Hidden Point Removal
+(HPR) algorithm, which is computationally efficient. HPR is also used to
+adaptively control the point density of each object according to the object's
+distance from the LiDAR. Experimental results show that the proposed DR.CPO
+algorithm is data-efficient and model-agnostic without incurring any
+computational overhead. Also, DR.CPO can improve mAP performance by 2.08% when
+compared to the best 3D detection result known for the KITTI dataset. The code
+is available at https://github.com/SNU-DRL/DRCPO.git
+
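+ Hidden Point Removal itself (Katz et al.) is short enough to sketch:
+spherically flip the point cloud around the viewpoint and keep the points
+that land on the convex hull of the flipped set (our minimal version, not
+the repository's code):
+
+import numpy as np
+from scipy.spatial import ConvexHull
+
+def hidden_point_removal(points: np.ndarray, viewpoint: np.ndarray,
+                         radius_scale: float = 100.0) -> np.ndarray:
+    """Indices of points visible from `viewpoint`."""
+    p = points - viewpoint
+    norms = np.linalg.norm(p, axis=1, keepdims=True)
+    radius = norms.max() * radius_scale
+    flipped = p + 2.0 * (radius - norms) * (p / norms)  # spherical flipping
+    # the viewpoint (origin) must take part in the hull computation
+    hull = ConvexHull(np.vstack([flipped, np.zeros(3)]))
+    return hull.vertices[hull.vertices < len(points)]
+
+pts = np.random.randn(1000, 3) + np.array([0.0, 0.0, 10.0])
+visible = hidden_point_removal(pts, viewpoint=np.zeros(3))
+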
+
+
+
+
+ + ♻ ☆ Expressive Text-to-Image Generation with Rich Text ICCV 2023 + + +
+ Plain text has become a prevalent interface for text-to-image synthesis. +However, its limited customization options hinder users from accurately +describing desired outputs. For example, plain text makes it hard to specify +continuous quantities, such as the precise RGB color value or importance of +each word. Furthermore, creating detailed text prompts for complex scenes is +tedious for humans to write and challenging for text encoders to interpret. To +address these challenges, we propose using a rich-text editor supporting +formats such as font style, size, color, and footnote. We extract each word's +attributes from rich text to enable local style control, explicit token +reweighting, precise color rendering, and detailed region synthesis. We achieve +these capabilities through a region-based diffusion process. We first obtain +each word's region based on attention maps of a diffusion process using plain +text. For each region, we enforce its text attributes by creating +region-specific detailed prompts and applying region-specific guidance, and +maintain its fidelity against plain-text generation through region-based +injections. We present various examples of image generation from rich text and +demonstrate that our method outperforms strong baselines with quantitative +evaluations. + +
+
+ comment: ICCV 2023. Project webpage: https://rich-text-to-image.github.io/ +
+
+
+
+
+ + ♻ ☆ From Chaos Comes Order: Ordering Event Representations for Object + Recognition and Detection ICCV 2023 + + +
+ Today, state-of-the-art deep neural networks that process events first
+convert them into dense, grid-like input representations before using an
+off-the-shelf network. However, selecting the appropriate representation for
+the task traditionally requires training a neural network for each
+representation and selecting the best one based on the validation score, which
+is very time-consuming. This work eliminates this bottleneck by selecting
+representations based on the Gromov-Wasserstein Discrepancy (GWD) between raw
+events and their representation. It is about 200 times faster to compute than
+training a neural network and preserves the task performance ranking of event
+representations across multiple representations, network backbones, datasets,
+and tasks. Thus, finding representations with high task scores is equivalent
+to finding representations with a low GWD. We use this insight to, for the
+first time, perform a hyperparameter search on a large family of event
+representations, revealing new and powerful representations that exceed the
+state-of-the-art. Our optimized representations outperform existing
+representations by 1.7 mAP on the 1 Mpx dataset and 0.3 mAP on the Gen1
+dataset, two established object detection benchmarks, and reach a 3.8% higher
+classification score on the mini N-ImageNet benchmark. Moreover, we outperform
+the state-of-the-art by 2.1 mAP on Gen1 and state-of-the-art feed-forward
+methods by 6.0 mAP on the 1 Mpx dataset. This work opens a new unexplored
+field of explicit representation optimization for event-based learning.
+
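+ Gromov-Wasserstein compares two point sets through their internal distance
+matrices, so no cross-space metric is needed; a small sketch using the POT
+library (assuming `pip install pot`; the paper's exact GWD setup may
+differ):
+
+import numpy as np
+import ot  # POT: Python Optimal Transport
+
+def gw_discrepancy(raw_events: np.ndarray, repr_pts: np.ndarray) -> float:
+    """Squared-loss Gromov-Wasserstein between two point clouds of possibly
+    different dimensionality, compared via intra-set distance matrices."""
+    c1 = ot.dist(raw_events, raw_events)
+    c2 = ot.dist(repr_pts, repr_pts)
+    p = np.full(len(raw_events), 1.0 / len(raw_events))
+    q = np.full(len(repr_pts), 1.0 / len(repr_pts))
+    return ot.gromov.gromov_wasserstein2(c1, c2, p, q, 'square_loss')
+
+score = gw_discrepancy(np.random.rand(50, 4), np.random.rand(30, 9))
+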
+
+ comment: 15 pages, 11 figures, 2 tables, ICCV 2023 Camera Ready paper +
+
+
+
+
+ + ♻ ☆ Learning Melanocytic Cell Masks from Adjacent Stained Tissue MICCAI + + +
+ Melanoma is one of the most aggressive forms of skin cancer, causing a large
+proportion of skin cancer deaths. However, melanoma diagnoses by pathologists
+show low interrater reliability. As melanoma is a cancer of the melanocyte,
+there is a clear need to develop a melanocytic cell segmentation tool that is
+agnostic to pathologist variability and automates pixel-level annotation.
+Gigapixel-level pathologist labeling, however, is impractical. Herein, we
+propose a means to train deep neural networks for melanocytic cell
+segmentation from hematoxylin and eosin (H&E) stained sections and paired
+immunohistochemistry (IHC) of adjacent tissue sections, achieving a mean IoU
+of 0.64 despite imperfect ground-truth labels.
+
+
+ comment: Accepted at Medical Image Learning with Limited & Noisy Data + Workshop, Medical Image Computing and Computer Assisted Interventions + (MICCAI) 2022 +
+
+
+
+
+ + ♻ ☆ DALL-Eval: Probing the Reasoning Skills and Social Biases of + Text-to-Image Generation Models ICCV 2023 + + +
+ Recently, DALL-E, a multimodal transformer language model, and its variants, +including diffusion models, have shown high-quality text-to-image generation +capabilities. However, despite the realistic image generation results, there +has not been a detailed analysis of how to evaluate such models. In this work, +we investigate the visual reasoning capabilities and social biases of different +text-to-image models, covering both multimodal transformer language models and +diffusion models. First, we measure three visual reasoning skills: object +recognition, object counting, and spatial relation understanding. For this, we +propose PaintSkills, a compositional diagnostic evaluation dataset that +measures these skills. Despite the high-fidelity image generation capability, a +large gap exists between the performance of recent models and the upper bound +accuracy in object counting and spatial relation understanding skills. Second, +we assess the gender and skin tone biases by measuring the gender/skin tone +distribution of generated images across various professions and attributes. We +demonstrate that recent text-to-image generation models learn specific biases +about gender and skin tone from web image-text pairs. We hope our work will +help guide future progress in improving text-to-image generation models on +visual reasoning skills and learning socially unbiased representations. Code +and data: https://github.com/j-min/DallEval + +
+
+ comment: ICCV 2023 (34 pages; see appendix for version changelog) +
+
+
+
+
+ + ♻ ☆ High-Perceptual Quality JPEG Decoding via Posterior Sampling CVPR 2023 + + +
+ JPEG is arguably the most popular image coding format, achieving high
+compression ratios via lossy quantization that may create degrading visual
+artifacts. Numerous attempts to remove these artifacts have been conceived
+over the years, and common to most of them is the use of deterministic
+post-processing algorithms that optimize some distortion measure (e.g., PSNR,
+SSIM). In this paper we propose a different paradigm for JPEG artifact
+correction: our method is stochastic, and the objective we target is high
+perceptual quality -- striving to obtain sharp, detailed and visually pleasing
+reconstructed images, while remaining consistent with the compressed input.
+These goals are achieved by training a stochastic conditional generator
+(conditioned on the compressed input), accompanied by a theoretically
+well-founded loss term, resulting in a sampler from the posterior
+distribution. Our solution offers a diverse set of plausible and fast
+reconstructions for a given input with perfect consistency. We demonstrate our
+scheme's unique properties and its superiority to a variety of alternative
+methods on the FFHQ and ImageNet datasets.
+
+
+ comment: Presented in NTIRE workshop as part of CVPR 2023 +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems CIKM'2023 + + +
+ In sequential recommendation, multi-modal information (e.g., text or image) +can provide a more comprehensive view of an item's profile. The optimal stage +(early or late) to fuse modality features into item representations is still +debated. We propose a graph-based approach (named MMSR) to fuse modality +features in an adaptive order, enabling each modality to prioritize either its +inherent sequential nature or its interplay with other modalities. MMSR +represents each user's history as a graph, where the modality features of each +item in a user's history sequence are denoted by cross-linked nodes. The edges +between homogeneous nodes represent intra-modality sequential relationships, +and the ones between heterogeneous nodes represent inter-modality +interdependence relationships. During graph propagation, MMSR incorporates dual +attention, differentiating homogeneous and heterogeneous neighbors. To +adaptively assign nodes with distinct fusion orders, MMSR allows each node's +representation to be asynchronously updated through an update gate. In +scenarios where modalities exhibit stronger sequential relationships, the +update gate prioritizes updates among homogeneous nodes. Conversely, when the +interdependent relationships between modalities are more pronounced, the update +gate prioritizes updates among heterogeneous nodes. Consequently, MMSR +establishes a fusion order that spans a spectrum from early to late modality +fusion. In experiments across six datasets, MMSR consistently outperforms +state-of-the-art models, and our graph propagation methods surpass other graph +neural networks. Additionally, MMSR naturally manages missing modalities. + +
+
+ comment: CIKM'2023 +
+
+
+
+
+ + ☆ Denoising Attention for Query-aware User Modeling in Personalized Search + + +
+ The personalization of search results has gained increasing attention in the +past few years, thanks to the development of Neural Networks-based approaches +for Information Retrieval and the importance of personalization in many search +scenarios. Recent works have proposed to build user models at query time by +leveraging the Attention mechanism, which allows weighing the contribution of +the user-related information w.r.t. the current query. This approach allows +taking into account the diversity of the user's interests by giving more +importance to those related to the current search performed by the user. + In this paper, we first discuss some shortcomings of the standard Attention +formulation when employed for personalization. In particular, we focus on +issues related to its normalization mechanism and its inability to entirely +filter out noisy user-related information. Then, we introduce the Denoising +Attention mechanism: an Attention variant that directly tackles the above +shortcomings by adopting a robust normalization scheme and introducing a +filtering mechanism. The reported experimental evaluation shows the benefits of +the proposed approach over other Attention-based variants. + +
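+ One way to see the filtering issue: softmax forces the weights over the
+user's history to sum to one, so noisy items can never receive zero total
+mass. A toy contrast (our illustration; the paper's Denoising Attention has
+its own normalization and filtering form):
+
+import torch
+
+def softmax_user_model(query, history):
+    """Standard attention: weights sum to 1 even if all items are noise."""
+    w = torch.softmax(history @ query, dim=0)
+    return w @ history
+
+def gated_user_model(query, history):
+    """A filtering variant: sigmoid gates can drive noisy items toward 0."""
+    g = torch.sigmoid(history @ query)
+    return (g / g.sum().clamp(min=1e-6)) @ history  # robust renormalization
+
+q, h = torch.randn(16), torch.randn(5, 16)
+print(softmax_user_model(q, h).shape, gated_user_model(q, h).shape)
+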
+
+
+
+
+ + ☆ DRGame: Diversified Recommendation for Multi-category Video Games with + Balanced Implicit Preferences + + +
+ The growing popularity of subscription services in video game consumption has
+emphasized the importance of offering diversified recommendations. Providing
+users with a diverse range of games is essential for ensuring continued
+engagement and fostering long-term subscriptions. However, existing
+recommendation models face challenges in effectively handling highly
+imbalanced implicit feedback in gaming interactions. Additionally, they
+struggle to take into account the distinctive characteristics of multiple
+categories and the latent user interests associated with these categories. In
+response to these challenges, we propose a novel framework, named DRGame, to
+obtain diversified recommendations. It is centered on multi-category video
+games and consists of two components: Balance-driven Implicit Preferences
+Learning for data pre-processing, and a Clustering-based Diversified
+Recommendation Module for final prediction. The first module aims to achieve a
+balanced representation of implicit feedback in game time, thereby discovering
+a comprehensive view of player interests across different categories. The
+second module adopts category-aware representation learning to cluster and
+select players and games based on balanced implicit preferences, and then
+employs asymmetric neighbor aggregation to achieve diversified
+recommendations. Experimental results on a real-world dataset demonstrate the
+superiority of our proposed method over existing approaches in terms of game
+diversity recommendations.
+
+
+
+
+
+ + ☆ Knowledge-grounded Natural Language Recommendation Explanation + + +
+ Explanations accompanied by a recommendation can assist users in +understanding the decision made by recommendation systems, which in turn +increases a user's confidence and trust in the system. Recently, research has +focused on generating natural language explanations in a human-readable format. +Thus far, the proposed approaches leverage item reviews written by users, which +are often subjective, sparse in language, and unable to account for new items +that have not been purchased or reviewed before. Instead, we aim to generate +fact-grounded recommendation explanations that are objectively described with +item features while implicitly considering a user's preferences, based on the +user's purchase history. To achieve this, we propose a knowledge graph (KG) +approach to natural language explainable recommendation. Our approach draws on +user-item features through a novel collaborative filtering-based KG +representation to produce fact-grounded, personalized explanations, while +jointly learning user-item representations for recommendation scoring. +Experimental results show that our approach consistently outperforms previous +state-of-the-art models on natural language explainable recommendation. + +
+
+
+
+
+ + ☆ Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling + Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate + Prediction CIKM 2023 + + +
+ Spatial-temporal information has been proven to be of great significance for
+click-through rate prediction tasks in online Location-Based Services (LBS),
+especially in mainstream food ordering platforms such as DoorDash, Uber Eats,
+Meituan, and Ele.me. Modeling user spatial-temporal preferences with
+sequential behavior data has become a hot topic in recommendation systems and
+online advertising. However, most existing methods either lack a
+representation of rich spatial-temporal information or only handle user
+behaviors of limited length, e.g., 100. In this paper, we tackle these
+problems by designing a new spatial-temporal modeling paradigm named Fragment
+and Integrate Network (FIN). FIN consists of two networks: (i) a Fragment
+Network (FN) extracts Multiple Sub-Sequences (MSS) from lifelong sequential
+behavior data and captures the specific spatial-temporal representation by
+modeling each MSS respectively. Here, both a simplified attention and a
+complicated attention are adopted to balance performance gain and resource
+consumption. (ii) An Integrate Network (IN) builds a new integrated sequence
+by utilizing spatial-temporal interaction on the MSS and captures the
+comprehensive spatial-temporal representation by modeling the integrated
+sequence with a complicated attention. Both public datasets and production
+datasets have demonstrated the accuracy and scalability of FIN. Since 2022,
+FIN has been fully deployed in the recommendation advertising system of
+Ele.me, one of the most popular online food ordering platforms in China,
+obtaining a 5.7% improvement in Click-Through Rate (CTR) and a 7.3% increase
+in Revenue Per Mille (RPM).
+
+
+ comment: Accepted by CIKM 2023 Applied Research Paper +
+
+
+
+
+ + ☆ A Survey on Multi-Behavior Sequential Recommendation + + +
+ Recommender systems are designed to address the issue of information overload
+in traditional information retrieval systems, focusing on recommending the
+information of most interest to users from massive collections. Generally,
+there is a sequential nature and heterogeneity to the behavior of a person
+interacting with a system, which has led to the proposal of multi-behavior
+sequential recommendation (MBSR). MBSR is a relatively new and worthy
+direction for in-depth research, which can achieve state-of-the-art
+recommendation through suitable modeling, and some related works have already
+been proposed. This survey aims to shed light on the MBSR problem. Firstly, we
+introduce MBSR in detail, including its problem definition, application
+scenarios, and the challenges it faces. Secondly, we detail the classification
+of MBSR, including neighborhood-based methods, matrix factorization-based
+methods, and deep learning-based methods, where we further classify the deep
+learning-based methods into different learning architectures based on RNNs,
+GNNs, Transformers, and generic architectures, as well as architectures that
+integrate hybrid techniques. For each method, we present related works from
+the data perspective and the modeling perspective, and analyze the strengths,
+weaknesses, and features of these works. Finally, we discuss some promising
+future research directions to address the challenges and improve the current
+status of MBSR.
+
+
+
+
+
+ + ♻ ☆ On the Consistency of Average Embeddings for Item Recommendation RecSys 2023 + + +
+ A prevalent practice in recommender systems consists in averaging item +embeddings to represent users or higher-level concepts in the same embedding +space. This paper investigates the relevance of such a practice. For this +purpose, we propose an expected precision score, designed to measure the +consistency of an average embedding relative to the items used for its +construction. We subsequently analyze the mathematical expression of this score +in a theoretical setting with specific assumptions, as well as its empirical +behavior on real-world data from music streaming services. Our results +emphasize that real-world averages are less consistent for recommendation, +which paves the way for future research to better align real-world embeddings +with assumptions from our theoretical setting. + +
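+ A toy version of the question being measured (our illustration; the paper
+defines the expected precision score formally): average a few item
+embeddings and check how many of the constituent items rank closest to that
+average:
+
+import numpy as np
+
+def average_embedding_precision(items: np.ndarray,
+                                catalog: np.ndarray) -> float:
+    """Fraction of constituent items retrieved in the top-k of the mean."""
+    avg = items.mean(axis=0)
+    avg /= np.linalg.norm(avg)
+    sims = catalog @ avg / np.linalg.norm(catalog, axis=1)  # cosine scores
+    k = len(items)
+    top_k = np.argsort(-sims)[:k]
+    hits = (catalog[top_k][:, None] == items[None]).all(-1).any(-1)
+    return float(hits.mean())
+
+catalog = np.random.randn(1000, 32)
+items = catalog[np.random.choice(1000, 10, replace=False)]
+print(average_embedding_precision(items, catalog))
+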
+
+ comment: 17th ACM Conference on Recommender Systems (RecSys 2023) +
+
+
+
+
+ + ♻ ☆ Large Language Models are not Fair Evaluators + + +
+ In this paper, we uncover a systematic bias in the evaluation paradigm of
+adopting large language models (LLMs), e.g., GPT-4, as referees to score and
+compare the quality of responses generated by candidate models. We find that
+the quality ranking of candidate responses can be easily hacked by simply
+altering their order of appearance in the context. This manipulation allows us
+to skew the evaluation result, making one model appear considerably superior
+to the other; e.g., Vicuna-13B could beat ChatGPT on 66 of 80 tested queries
+with ChatGPT as the evaluator. To address this issue, we propose a calibration
+framework with three simple yet effective strategies: 1) Multiple Evidence
+Calibration, which requires the evaluator model to generate multiple pieces of
+evaluation evidence before assigning ratings; 2) Balanced Position
+Calibration, which aggregates results across various orders to determine the
+final score; and 3) Human-in-the-Loop Calibration, which introduces a balanced
+position diversity entropy to measure the difficulty of each example and seeks
+human assistance when needed. We also manually annotate the "win/tie/lose"
+outcomes of responses from ChatGPT and Vicuna-13B on the Vicuna Benchmark's
+question prompts, and extensive experiments demonstrate that our approach
+successfully mitigates evaluation bias, resulting in closer alignment with
+human judgments. We release our code and human annotation at
+\url{https://github.com/i-Eval/FairEval} to facilitate future research.
+
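+ Balanced Position Calibration is easy to sketch: query the judge with both
+response orders and average, so position bias cancels (a schematic with an
+assumed `judge` callable, not the released code):
+
+from typing import Callable, Tuple
+
+def balanced_position_scores(
+        question: str, resp_a: str, resp_b: str,
+        judge: Callable[[str, str, str], Tuple[float, float]]
+) -> Tuple[float, float]:
+    """judge(q, first, second) -> (score_first, score_second)."""
+    a1, b1 = judge(question, resp_a, resp_b)   # A shown first
+    b2, a2 = judge(question, resp_b, resp_a)   # B shown first
+    return (a1 + a2) / 2, (b1 + b2) / 2
+
+# toy judge that always favors whichever answer appears first
+biased = lambda q, first, second: (9.0, 7.0)
+print(balanced_position_scores("q?", "ans A", "ans B", biased))  # (8.0, 8.0)
+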
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable
+tools for users to navigate through the vast amount of content available on
+platforms like daily news websites and book recommendation services. However,
+existing recommenders face significant challenges in understanding the content
+of items. Large language models (LLMs), which possess deep semantic
+comprehension and extensive knowledge from pretraining, have proven to be
+effective in various natural language processing tasks. In this study, we
+explore the potential of leveraging both open- and closed-source LLMs to
+enhance content-based recommendation. With open-source LLMs, we utilize their
+deep layers as content encoders, enriching the representation of content at
+the embedding level. For closed-source LLMs, we employ prompting techniques to
+enrich the training data at the token level. Through comprehensive
+experiments, we demonstrate the high effectiveness of both types of LLMs and
+show the synergistic relationship between them. Notably, we observed a
+significant relative improvement of up to 19.32% compared to existing
+state-of-the-art recommendation models. These findings highlight the immense
+potential of both open- and closed-source LLMs in enhancing content-based
+recommendation systems. We will make our code and LLM-generated data available
+for other researchers to reproduce our results.
+
+
+
+
+
+ + ♻ ☆ Voucher Abuse Detection with Prompt-based Fine-tuning on Graph Neural + Networks CIKM23 + + +
+ Voucher abuse detection is an important anomaly detection problem in
+E-commerce. While many GNN-based solutions have emerged, the supervised
+paradigm depends on a large quantity of labeled data. A popular alternative is
+to adopt self-supervised pre-training using label-free data, and further
+fine-tune on a downstream task with limited labels. Nevertheless, the
+"pre-train, fine-tune" paradigm is often plagued by the objective gap between
+pre-training and downstream tasks. Hence, we propose VPGNN, a prompt-based
+fine-tuning framework on GNNs for voucher abuse detection. We design a novel
+graph prompting function to reformulate the downstream task into a template
+similar to the pretext task used in pre-training, thereby narrowing the
+objective gap. Extensive experiments on both proprietary and public datasets
+demonstrate the strength of VPGNN in both few-shot and semi-supervised
+scenarios. Moreover, an online deployment of VPGNN in a production environment
+shows a 23.4% improvement over two existing deployed models.
+
+
+ comment: 7 pages, Accepted by CIKM23 Applied Research Track +
+
+
+
+
+
+
+
+ + Machine Learning 124 + +
+
+
+ + ☆ Algebraic, Topological, and Mereological Foundations of Existential + Granules + + +
+ In this research, new concepts of existential granules that determine
+themselves are invented and characterized from algebraic, topological, and
+mereological perspectives. Existential granules are those that determine
+themselves initially and interact with their environment subsequently.
+Examples of the concept, such as granular balls, are already used in
+applications of rough sets and soft computing, though they were inadequately
+defined, only algorithmically established, and insufficiently theorized in
+earlier works by others. It is shown that they fit into multiple theoretical
+frameworks (axiomatic, adaptive, and others) of granular computing. The
+characterization is intended for algorithm development, application to
+classification problems, and possible mathematical foundations of
+generalizations of the approach. Additionally, many open problems are posed
+and directions provided.
+
+
+ comment: 15 Pages +
+
+
+
+
+ + ☆ Modality Cycles with Masked Conditional Diffusion for Unsupervised + Anomaly Segmentation in MRI MICCAI + 2023 + + +
+ Unsupervised anomaly segmentation aims to detect patterns that are distinct +from any patterns processed during training, commonly called abnormal or +out-of-distribution patterns, without providing any associated manual +segmentations. Since anomalies during deployment can lead to model failure, +detecting the anomaly can enhance the reliability of models, which is valuable +in high-risk domains like medical imaging. This paper introduces Masked +Modality Cycles with Conditional Diffusion (MMCCD), a method that enables +segmentation of anomalies across diverse patterns in multimodal MRI. The method +is based on two fundamental ideas. First, we propose the use of cyclic modality +translation as a mechanism for enabling abnormality detection. +Image-translation models learn tissue-specific modality mappings, which are +characteristic of tissue physiology. Thus, these learned mappings fail to +translate tissues or image patterns that have never been encountered during +training, and the error enables their segmentation. Furthermore, we combine +image translation with a masked conditional diffusion model, which attempts to +`imagine' what tissue exists under a masked area, further exposing unknown +patterns as the generative model fails to recreate them. We evaluate our method +on a proxy task by training on healthy-looking slices of BraTS2021 +multi-modality MRIs and testing on slices with tumors. We show that our method +compares favorably to previous unsupervised approaches based on image +reconstruction and denoising with autoencoders and diffusion models. + +
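+ The first idea reduces to a simple inference-time test: translate one
+modality to another and back, and score each pixel by the cycle error, since
+mappings learned only on healthy tissue fail on unseen patterns. A schematic
+sketch with assumed translator modules (not the exact MMCCD pipeline):
+
+import torch
+import torch.nn as nn
+
+def cycle_anomaly_map(t1: torch.Tensor, t1_to_t2: nn.Module,
+                      t2_to_t1: nn.Module) -> torch.Tensor:
+    """Per-pixel anomaly score: reconstruction error after a modality cycle.
+
+    Both translators are assumed trained on healthy-looking slices only, so
+    tissue they never saw (e.g., tumor) is translated poorly and lights up."""
+    with torch.no_grad():
+        t1_cycled = t2_to_t1(t1_to_t2(t1))
+    return (t1 - t1_cycled).abs().mean(dim=1)  # average over channels
+
+# toy stand-ins for the two translation networks
+f, g = nn.Conv2d(1, 1, 3, padding=1), nn.Conv2d(1, 1, 3, padding=1)
+score = cycle_anomaly_map(torch.randn(2, 1, 64, 64), f, g)
+print(score.shape)  # torch.Size([2, 64, 64])
+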
+
+ comment: Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI + 2023 +
+
+
+
+
+ + ☆ Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open + Generative Large Language Models + + +
+ We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric +foundation and instruction-tuned open generative large language models (LLMs). +The models are based on the GPT-3 decoder-only architecture and are pretrained +on a mixture of Arabic and English texts, including source code in various +programming languages. With 13 billion parameters, they demonstrate better +knowledge and reasoning capabilities in Arabic than any existing open Arabic +and multilingual models by a sizable margin, based on extensive evaluation. +Moreover, the models are competitive in English compared to English-centric +open models of similar size, despite being trained on much less English data. +We provide a detailed description of the training, the tuning, the safety +alignment, and the evaluation of the models. We release two open versions of +the model -- the foundation Jais model, and an instruction-tuned Jais-chat +variant -- with the aim of promoting research on Arabic LLMs. Available at +https://huggingface.co/inception-mbzuai/jais-13b-chat + +
+
+ comment: Arabic-centric, foundation model, large-language model, LLM, + generative model, instruction-tuned, Jais, Jais-chat +
+
+
+
+
+ + ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,
+organs, vessels) and 3D surgical instrument models. Prior to the deep learning
+era, the broad application of statistical shape models (SSMs) in medical image
+analysis was evidence that shapes were commonly used to describe medical data.
+Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in medical
+imaging are predominantly voxel-based. In computer vision, on the contrary,
+shapes (including voxel occupancy grids, meshes, point clouds and implicit
+surface models) are preferred data representations in 3D, as seen from the
+numerous shape-related publications in premier vision conferences, such as the
+IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as well
+as the increasing popularity of ShapeNet (about 51,300 models) and Princeton
+ModelNet (127,915 models) in computer vision research. MedShapeNet is created
+as an alternative to these commonly used shape benchmarks to facilitate the
+translation of data-driven vision algorithms to medical applications, and it
+extends the opportunities to adapt SOTA vision algorithms to solve critical
+medical problems. Besides, the majority of the medical shapes in MedShapeNet
+are modeled directly on the imaging data of real patients; it therefore
+complements well the existing shape benchmarks, which comprise computer-aided
+design (CAD) models. MedShapeNet currently includes more than 100,000 medical
+shapes, and provides annotations in the form of paired data. It is therefore
+also a freely available repository of 3D models for extended reality (virtual
+reality - VR, augmented reality - AR, mixed reality - MR) and medical 3D
+printing. This white paper describes in detail the motivations behind
+MedShapeNet, the shape acquisition procedures, the use cases, as well as the
+usage of the online shape search portal: https://medshapenet.ikim.nrw/
+
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Spatial Graph Coarsening: Weather and Weekday Prediction with London's + Bike-Sharing Service using GNN + + +
+ This study introduces the use of Graph Neural Networks (GNNs) for predicting the weather and weekday of a given day in London from the Santander Cycles bike-sharing dataset, cast as a graph classification task. The proposed GNN models newly introduce (i) a concatenation operator for graph features with trained node embeddings and (ii) a graph coarsening operator based on geographical contiguity, namely "Spatial Graph Coarsening". Using node features describing land-use characteristics and the number of households around the bike stations, together with graph features of temperatures in the city, our proposed models outperformed the baseline model in cross-entropy loss and accuracy on the validation dataset. + &#x0D;
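The abstract does not spell out the exact contiguity rule behind "Spatial Graph Coarsening", so the grid-cell grouping below is only an illustrative assumption of how stations could be merged into geographically contiguous super-nodes:

```python
from collections import defaultdict

# Toy bike-station graph: node -> (lon, lat), edges with trip counts.
stations = {0: (-0.12, 51.50), 1: (-0.11, 51.50), 2: (-0.05, 51.52)}
edges = [(0, 1, 15), (1, 2, 4), (0, 2, 2)]  # (u, v, weight)

def coarsen_by_grid(stations, edges, cell=0.05):
    """Merge stations sharing a grid cell; sum weights of merged edges."""
    cell_of = {n: (round(lon / cell), round(lat / cell))
               for n, (lon, lat) in stations.items()}
    super_edges = defaultdict(int)
    for u, v, w in edges:
        cu, cv = cell_of[u], cell_of[v]
        if cu != cv:                      # drop intra-cell self-loops
            super_edges[frozenset((cu, cv))] += w
    return cell_of, dict(super_edges)

cell_of, coarse = coarsen_by_grid(stations, edges)
print(len(set(cell_of.values())), "super-nodes,", len(coarse), "super-edges")
```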
+
+
+
+
+ + ☆ survex: an R package for explaining machine learning survival models + + +
+ Due to their flexibility and superior performance, machine learning models +frequently complement and outperform traditional statistical survival models. +However, their widespread adoption is hindered by a lack of user-friendly tools +to explain their internal operations and prediction rationales. To tackle this +issue, we introduce the survex R package, which provides a cohesive framework +for explaining any survival model by applying explainable artificial +intelligence techniques. The capabilities of the proposed software encompass +understanding and diagnosing survival models, which can lead to their +improvement. By revealing insights into the decision-making process, such as +variable effects and importances, survex enables the assessment of model +reliability and the detection of biases. Thus, transparency and responsibility +may be promoted in sensitive areas, such as biomedical research and healthcare +applications. + +
+
+
+
+
+ + ☆ Advanced Deep Regression Models for Forecasting Time Series Oil + Production + + +
+ Global oil demand is rapidly increasing and is expected to reach 106.3 million barrels per day by 2040. Thus, it is vital for hydrocarbon extraction industries to forecast their production to optimize their operations and avoid losses. Big companies have realized that exploiting the power of deep learning (DL) and the massive amount of data from various oil wells for this purpose can save a lot of operational costs and reduce unwanted environmental impacts. In this direction, researchers have proposed models using conventional machine learning (ML) techniques for oil production forecasting. However, these techniques are inappropriate for this problem as they cannot capture historical patterns found in time series data, resulting in inaccurate predictions. This research aims to overcome these issues by developing advanced data-driven regression models using sequential convolutions and long short-term memory (LSTM) units. Exhaustive analyses are conducted to select the optimal sequence length, model hyperparameters, and cross-well dataset formation to build highly generalized robust models. A comprehensive experimental study on Volve oilfield data validates the proposed models. It reveals that the LSTM-based sequence learning model can predict oil production better than the 1-D convolutional neural network (CNN), achieving a mean absolute error (MAE) of 111.16 and an R2 score of 0.98. It is also found that the LSTM-based model performs better than all the existing state-of-the-art solutions and achieves a 37% improvement compared to a standard linear regression, which is considered the baseline model in this work. + &#x0D;
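A minimal sketch of the kind of LSTM-based sequence regressor described above; the hidden size, window length, and toy data are illustrative choices, not the paper's tuned configuration:

```python
import torch
import torch.nn as nn

class ProductionLSTM(nn.Module):
    """LSTM regressor: a window of past production -> next-step forecast."""
    def __init__(self, hidden=64):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)
    def forward(self, x):               # x: (batch, seq_len, 1)
        out, _ = self.lstm(x)
        return self.head(out[:, -1])    # last hidden state -> scalar forecast

seq_len = 30
model = ProductionLSTM()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.L1Loss()                   # MAE, the metric reported above

x = torch.randn(8, seq_len, 1)          # toy batch of production windows
y = torch.randn(8, 1)                   # next-step production targets
for _ in range(5):                      # a few illustrative training steps
    opt.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    opt.step()
```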
+
+
+
+
+ + ☆ Application of Zone Method based Machine Learning and Physics-Informed + Neural Networks in Reheating Furnaces + + +
+ Despite the high economic relevance of Foundation Industries, certain components like reheating furnaces within their manufacturing chains are energy-intensive. Notable reductions in energy consumption could be obtained by reducing the overall heating time in furnaces. Computer-integrated Machine Learning (ML) and Artificial Intelligence (AI) powered control systems in furnaces could be enablers in achieving the Net-Zero goals in Foundation Industries for sustainable manufacturing. + In this work, due to the infeasibility of obtaining good-quality data in scenarios like reheating furnaces, a computational model based on the classical Hottel's zone method has been used to generate data for training ML and Deep Learning (DL) models via regression. It should be noted that the zone method provides an elegant way to model the physical phenomenon of Radiative Heat Transfer (RHT), the dominant heat transfer mechanism in high-temperature processes inside heating furnaces. Using this data, an extensive comparison among a wide range of state-of-the-art, representative ML and DL methods has been made against their temperature prediction performance in varying furnace environments. Owing to their holistic balance between inference time and model performance, DL methods stand out among their counterparts. To further enhance the Out-Of-Distribution (OOD) generalization capability of the trained DL models, we propose a Physics-Informed Neural Network (PINN) that incorporates prior physical knowledge using a set of novel Energy-Balance regularizers. Our setup is a generic framework that is agnostic to the 3D geometry of the underlying furnace and as such can accommodate any standard ML regression model, serving as a Digital Twin of the underlying physical processes for transitioning Foundation Industries towards Industry 4.0. + &#x0D;
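The abstract does not give the concrete form of the Energy-Balance regularizers, so the sketch below shows only the generic PINN pattern it implies: a data-fitting loss plus a weighted penalty on the violation of a physical constraint. `energy_residual` is a hypothetical placeholder for the zone-method balance:

```python
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 1))

def energy_residual(inputs, temps):
    """Hypothetical stand-in for a zone-method energy balance: a real setup
    would compare radiative heat in/out per zone; here we merely penalize
    non-physical negative predicted temperatures as a placeholder."""
    return torch.relu(-temps).mean()

def pinn_loss(inputs, targets, lam=0.1):
    temps = net(inputs)
    data_loss = nn.functional.mse_loss(temps, targets)
    physics_loss = energy_residual(inputs, temps)
    return data_loss + lam * physics_loss  # physics term regularizes OOD behavior

x = torch.randn(32, 4)          # toy zone features (geometry-agnostic inputs)
y = torch.rand(32, 1) * 1500.0  # toy zone temperatures from a Hottel-style model
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
opt.zero_grad(); pinn_loss(x, y).backward(); opt.step()
```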
+
+
+
+
+ + ☆ Consensus of state of the art mortality prediction models: From + all-cause mortality to sudden death prediction + + +
+ Worldwide, many millions of people die suddenly and unexpectedly each year, either with or without a prior history of cardiovascular disease. Such events are sparse (once in a lifetime), many victims will not have had prior investigations for cardiac disease, and many different definitions of sudden death exist. Accordingly, sudden death is hard to predict. + This analysis used NHS Electronic Health Records (EHRs) for people aged $\geq$50 years living in the Greater Glasgow and Clyde (GG\&C) region in 2010 (n = 380,000) to try to overcome these challenges. We investigated whether medical history, blood tests, prescription of medicines, and hospitalisations might, in combination, predict a heightened risk of sudden death. + We compared the performance of models trained to predict either sudden death or all-cause mortality. We built six models for each outcome of interest: three taken from state-of-the-art research (BEHRT, Deepr and Deep Patient), and three of our own creation. We trained these using two different data representations: a language-based representation, and a sparse temporal matrix. + We used global interpretability to understand the most important features of each model, and compared how much agreement there was amongst models using Rank Biased Overlap. It is challenging to account for correlated variables without increasing the complexity of the interpretability technique. We overcame this by clustering features into groups and comparing the most important groups for each model. We found the agreement between models to be much higher when accounting for correlated variables. + Our analysis emphasises the challenge of predicting sudden death and the need for better understanding and interpretation of machine learning models applied to healthcare applications. + &#x0D;
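Rank Biased Overlap, used above to compare feature rankings across models, is a weighted average of the overlap of two rankings at every depth. A minimal truncated implementation (the persistence parameter `p` is a free choice; 0.9 is a common default, not necessarily the paper's):

```python
def rbo(ranking_a, ranking_b, p=0.9):
    """Truncated Rank Biased Overlap of two ranked lists.

    Averages the overlap of the top-d prefixes over depths d, with
    geometrically decaying weight p**(d-1); higher = more agreement.
    """
    k = min(len(ranking_a), len(ranking_b))
    seen_a, seen_b, score = set(), set(), 0.0
    for d in range(1, k + 1):
        seen_a.add(ranking_a[d - 1])
        seen_b.add(ranking_b[d - 1])
        overlap = len(seen_a & seen_b) / d
        score += (p ** (d - 1)) * overlap
    return (1 - p) * score

# Hypothetical top-feature rankings from two mortality models.
model_x = ["age", "egfr", "bnp", "statin", "admissions"]
model_y = ["age", "bnp", "egfr", "admissions", "smoking"]
print(round(rbo(model_x, model_y), 3))   # agreement of top features
```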
+
+
+
+
+ + ☆ Conti Inc.: Understanding the Internal Discussions of a large + Ransomware-as-a-Service Operator with Machine Learning + + +
+ Ransomware-as-a-service (RaaS) is increasing the scale and complexity of ransomware attacks. Understanding the internal operations behind RaaS has been a challenge due to the illegality of such activities. The recent chat leak of the Conti RaaS operator, one of the most infamous ransomware operators on the international scene, offers a key opportunity to better understand the inner workings of such organizations. This paper analyzes the main topics discussed in the Conti chat leak using machine learning techniques such as Natural Language Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as visualization strategies. Five discussion topics are found: 1) Business, 2) Technical, 3) Internal tasking/Management, 4) Malware, and 5) Customer Service/Problem Solving. Moreover, the distribution of topics among Conti members shows that only 4% of individuals have specialized discussions while almost all individuals (96%) are all-rounders, meaning that their discussions revolve around all five topics. The results also indicate that a significant proportion of Conti discussions are non-tech related. This study thus highlights that running such large RaaS operations requires a workforce skilled beyond technical abilities, with individuals involved in various tasks, from management to customer service or problem solving. The discussion topics also show that the organization behind the Conti RaaS operator shares similarities with a large firm. We conclude that, although RaaS represents an example of specialization in the cybercrime industry, only a few members are specialized in one topic, while the rest run and coordinate the RaaS operation. + &#x0D;
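A minimal sketch of the LDA topic-modeling step described above, using scikit-learn on toy chat messages; the five-topic setting mirrors the paper's finding, while the tokenization and preprocessing are simplifying assumptions:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Toy stand-ins for leaked chat messages (synthetic, for illustration only).
chats = [
    "send the payment to the new wallet",
    "the loader crashes on windows 11, fix the build",
    "assign this task to the new hire and report daily",
    "encrypt the files then post the ransom note",
    "customer says the decryptor fails, walk them through it",
]
vec = CountVectorizer(stop_words="english")
X = vec.fit_transform(chats)

lda = LatentDirichletAllocation(n_components=5, random_state=0).fit(X)
words = vec.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    top = [words[i] for i in topic.argsort()[-3:][::-1]]  # top-3 words/topic
    print(f"topic {k}: {top}")
```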
+
+
+
+
+ + ☆ A Parameter-Free Two-Bit Covariance Estimator with Improved Operator + Norm Error Rate + + +
+ A covariance matrix estimator using two bits per entry was recently developed by Dirksen, Maly and Rauhut [Annals of Statistics, 50(6), pp. 3538-3562]. The estimator achieves near minimax rate for general sub-Gaussian distributions, but also suffers from two downsides: theoretically, there is an essential gap in operator norm error between their estimator and the sample covariance when the diagonal of the covariance matrix is dominated by only a few entries; practically, its performance heavily relies on the dithering scale, which needs to be tuned according to some unknown parameters. In this work, we propose a new 2-bit covariance matrix estimator that simultaneously addresses both issues. Unlike the sign quantizer associated with uniform dither in Dirksen et al., we adopt a triangular dither prior to a 2-bit quantizer inspired by the multi-bit uniform quantizer. By employing dithering scales varying across entries, our estimator enjoys an improved operator norm error rate that depends on the effective rank of the underlying covariance matrix rather than the ambient dimension, thus closing the theoretical gap. Moreover, our proposed method eliminates the need for any tuning parameter, as the dithering scales are entirely determined by the data. Experimental results under Gaussian samples are provided to showcase the impressive numerical performance of our estimator. Remarkably, by halving the dithering scales, our estimator oftentimes achieves operator norm errors less than twice the errors of the sample covariance. + &#x0D;
+
+ comment: 24 pages, 2 figures +
+
+
+
+
+ + ☆ Low-Rank Multitask Learning based on Tensorized SVMs and LSSVMs + + +
+ Multitask learning (MTL) leverages task-relatedness to enhance performance. With the emergence of multimodal data, tasks can now be referenced by multiple indices. In this paper, we employ high-order tensors, with each mode corresponding to a task index, to naturally represent tasks referenced by multiple indices and preserve their structural relations. Based on this representation, we propose a general framework of low-rank MTL methods with tensorized support vector machines (SVMs) and least squares support vector machines (LSSVMs), where the CP factorization is deployed over the coefficient tensor. Our approach allows us to model the task relation through a linear combination of shared factors weighted by task-specific factors, and generalizes to both classification and regression problems. Through the alternating optimization scheme and the Lagrangian function, each subproblem is transformed into a convex problem, formulated as a quadratic program or linear system in the dual form. In contrast to previous MTL frameworks, our decision function in the dual induces a weighted kernel function with a task-coupling term characterized by the similarities of the task-specific factors, better revealing the explicit relations across tasks in MTL. Experimental results validate the effectiveness and superiority of our proposed methods compared to existing state-of-the-art approaches in MTL. The code of the implementation will be available at https://github.com/liujiani0216/TSVM-MTL. + &#x0D;
+
+
+
+
+ + ☆ PAVI: Plate-Amortized Variational Inference + + +
+ Given observed data and a probabilistic generative model, Bayesian inference searches for the distribution of the model's parameters that could have yielded the data. Inference is challenging for large population studies where millions of measurements are performed over a cohort of hundreds of subjects, resulting in a massive parameter space. This large cardinality renders off-the-shelf Variational Inference (VI) computationally impractical. + In this work, we design structured VI families that efficiently tackle large population studies. Our main idea is to share the parameterization and learning across the different i.i.d. variables in a generative model, symbolized by the model's \textit{plates}. We name this concept \textit{plate amortization}. Contrary to off-the-shelf stochastic VI, which slows down inference, plate amortization yields variational distributions that are orders of magnitude faster to train. + Applied to large-scale hierarchical problems, PAVI yields expressive, parsimoniously parameterized VI with an affordable training time. This faster convergence effectively unlocks inference in those large regimes. We illustrate the practical utility of PAVI through a challenging Neuroimaging example featuring 400 million latent parameters, demonstrating a significant step towards scalable and expressive Variational Inference. + &#x0D;
+
+
+
+
+ + ☆ EnsembleFollower: A Hybrid Car-Following Framework Based On + Reinforcement Learning and Hierarchical Planning + + +
+ Car-following models have made significant contributions to our understanding +of longitudinal driving behavior. However, they often exhibit limited accuracy +and flexibility, as they cannot fully capture the complexity inherent in +car-following processes, or may falter in unseen scenarios due to their +reliance on confined driving skills present in training data. It is worth +noting that each car-following model possesses its own strengths and weaknesses +depending on specific driving scenarios. Therefore, we propose +EnsembleFollower, a hierarchical planning framework for achieving advanced +human-like car-following. The EnsembleFollower framework involves a high-level +Reinforcement Learning-based agent responsible for judiciously managing +multiple low-level car-following models according to the current state, either +by selecting an appropriate low-level model to perform an action or by +allocating different weights across all low-level components. Moreover, we +propose a jerk-constrained kinematic model for more convincing car-following +simulations. We evaluate the proposed method based on real-world driving data +from the HighD dataset. The experimental results illustrate that +EnsembleFollower yields improved accuracy of human-like behavior and achieves +effectiveness in combining hybrid models, demonstrating that our proposed +framework can handle diverse car-following conditions by leveraging the +strengths of various low-level models. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ FPTQ: Fine-grained Post-Training Quantization for Large Language Models + + +
+ In the era of large-scale language models, the substantial parameter size poses significant challenges for deployment. Being a prevalent compression technique, quantization has emerged as the mainstream practice to tackle this issue, which is mainly centered on two recipes, W8A8 and W4A16 (i.e. weights and activations in such bit widths). In this study, we propose a novel W4A8 post-training quantization method for the available open-sourced LLMs, which combines the advantages of both recipes. Therefore, we can leverage the benefit of 4-bit weight quantization in I/O utilization and the acceleration due to 8-bit matrix computation. Nevertheless, W4A8 faces notorious performance degradation. As a remedy, we involve layerwise activation quantization strategies which feature a novel logarithmic equalization for the most intractable layers, and we combine them with fine-grained weight quantization. Without bells and whistles, we eliminate the necessity for further fine-tuning and obtain state-of-the-art W4A8 quantized performance on BLOOM, LLaMA, and LLaMA-2 on standard benchmarks. We confirm that W4A8 quantization is achievable for the deployment of large language models, fostering their widespread real-world applications. + &#x0D;
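The W4A8 recipe amounts to storing weights in 4 bits with fine-grained (per-group) scales while quantizing activations to 8 bits at matmul time. A minimal symmetric round-to-nearest sketch, omitting the paper's logarithmic equalization for hard layers:

```python
import numpy as np

def quantize_sym(x, n_bits, axis=None):
    """Symmetric round-to-nearest quantization; returns ints and scale."""
    qmax = 2 ** (n_bits - 1) - 1
    scale = np.max(np.abs(x), axis=axis, keepdims=True) / qmax
    q = np.clip(np.round(x / scale), -qmax - 1, qmax)
    return q, scale

rng = np.random.default_rng(0)
W = rng.normal(size=(64, 64)).astype(np.float32)
x = rng.normal(size=(1, 64)).astype(np.float32)

# Fine-grained: one 4-bit scale per group of 16 input channels.
Wg = W.reshape(64, 4, 16)
qW, w_scale = quantize_sym(Wg, n_bits=4, axis=2)
W_deq = (qW * w_scale).reshape(64, 64)

qx, x_scale = quantize_sym(x, n_bits=8)   # W4A8: 8-bit activations
y_q = (qx * x_scale) @ W_deq.T
print(np.abs(y_q - x @ W.T).max())        # quantization error vs. full precision
```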
+
+
+
+
+ + ☆ Learning Structure-from-Motion with Graph Attention Networks + + +
+ In this paper we tackle the problem of learning Structure-from-Motion (SfM) through the use of graph attention networks. SfM is a classic computer vision problem that is solved through iterative minimization of reprojection errors, referred to as Bundle Adjustment (BA), starting from a good initialization. In order to obtain a good enough initialization for BA, conventional methods rely on a sequence of sub-problems (such as pairwise pose estimation, pose averaging or triangulation) which provide an initial solution that can then be refined using BA. In this work we replace these sub-problems by learning a model that takes as input the 2D keypoints detected across multiple views, and outputs the corresponding camera poses and 3D keypoint coordinates. Our model takes advantage of graph neural networks to learn SfM-specific primitives, and we show that it can be used for fast inference of the reconstruction for new and unseen sequences. The experimental results show that the proposed model outperforms competing learning-based methods, and challenges COLMAP while having lower runtime. + &#x0D;
+
+
+
+
+ + ☆ Demo: A Digital Twin of the 5G Radio Access Network for Anomaly + Detection Functionality + + +
+ Recently, the concept of digital twins (DTs) has received significant +attention within the realm of 5G/6G. This demonstration shows an innovative DT +design and implementation framework tailored toward integration within the 5G +infrastructure. The proposed DT enables near real-time anomaly detection +capability pertaining to user connectivity. It empowers the 5G system to +proactively execute decisions for resource control and connection restoration. + +
+
+ comment: 2 pages, 2 figures. This paper has been accepted by the 31st IEEE + International Conference on Network Protocols (ICNP 2023) +
+
+
+
+
+ + ☆ Jaccard-constrained dense subgraph discovery + + +
+ Finding dense subgraphs is a core problem in graph mining with many applications in diverse domains. At the same time, many real-world networks vary over time; that is, the dataset can be represented as a sequence of graph snapshots. Hence, it is natural to consider the question of finding dense subgraphs in a temporal network that are allowed to vary over time to a certain degree. In this paper, we search for dense subgraphs that have large pairwise Jaccard similarity coefficients. More formally, given a set of graph snapshots and a weight $\lambda$, we find a collection of dense subgraphs such that the sum of densities of the induced subgraphs plus the sum of Jaccard indices, weighted by $\lambda$, is maximized. We prove that this problem is NP-hard. To discover dense subgraphs with good objective value, we present an iterative algorithm which runs in $\mathcal{O}(n^2k^2 + m \log n + k^3 n)$ time per single iteration, and a greedy algorithm which runs in $\mathcal{O}(n^2k^2 + m \log n + k^3 n)$ time, where $k$ is the length of the graph sequence and $n$ and $m$ denote the number of nodes and the total number of edges, respectively. We show experimentally that our algorithms are efficient, that they can find ground truth in synthetic datasets, and that they provide interpretable results on real-world datasets. Finally, we present a case study that shows the usefulness of our problem. + &#x0D;
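Scoring a candidate collection under the objective above is straightforward; a sketch assuming each snapshot's subgraph is given as a node set together with its induced edge count:

```python
from itertools import combinations

def density(n_edges, n_nodes):
    """Average degree density m/|S| of an induced subgraph."""
    return n_edges / n_nodes if n_nodes else 0.0

def jaccard(s, t):
    """Jaccard similarity of two node sets."""
    return len(s & t) / len(s | t) if s | t else 0.0

def objective(subgraphs, lam):
    """Sum of densities plus lambda-weighted pairwise Jaccard indices.

    subgraphs: list of (node_set, induced_edge_count), one per snapshot.
    """
    dens = sum(density(m, len(s)) for s, m in subgraphs)
    jac = sum(jaccard(s, t) for (s, _), (t, _) in combinations(subgraphs, 2))
    return dens + lam * jac

# Three toy snapshots with overlapping dense node sets.
snapshots = [({1, 2, 3}, 3), ({2, 3, 4}, 2), ({1, 2, 3, 4}, 5)]
print(objective(snapshots, lam=0.5))
```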
+
+
+
+
+ + ☆ LLaSM: Large Language and Speech Model + + +
+ Multi-modal large language models have garnered significant interest recently. However, most of this work focuses on vision-language multi-modal models that provide strong capabilities in following vision-and-language instructions. We claim that speech is also an important modality through which humans interact with the world. Hence, it is crucial for a general-purpose assistant to be able to follow multi-modal speech-and-language instructions. In this work, we propose the Large Language and Speech Model (LLaSM). LLaSM is an end-to-end trained large multi-modal speech-language model with cross-modal conversational abilities, capable of following speech-and-language instructions. Our early experiments show that LLaSM demonstrates a more convenient and natural way for humans to interact with artificial intelligence. We also release a large Speech Instruction Following dataset, LLaSM-Audio-Instructions. Code and demo are available at https://github.com/LinkSoul-AI/LLaSM and https://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions dataset is available at https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions. + &#x0D;
+
+
+
+
+ + ☆ Cyclophobic Reinforcement Learning + + +
+ In environments with sparse rewards, finding a good inductive bias for exploration is crucial to the agent's success. However, there are two competing goals: novelty search and systematic exploration. While existing approaches such as curiosity-driven exploration find novelty, they sometimes do not systematically explore the whole state space, akin to depth-first search vs. breadth-first search. In this paper, we propose a new intrinsic reward that is cyclophobic, i.e., it does not reward novelty, but punishes redundancy by avoiding cycles. Augmenting the cyclophobic intrinsic reward with a sequence of hierarchical representations based on the agent's cropped observations, we are able to achieve excellent results in the MiniGrid and MiniHack environments. Both are particularly hard, as they require complex interactions with different objects in order to be solved. Detailed comparisons with previous approaches and thorough ablation studies show that our newly proposed cyclophobic reinforcement learning is more sample efficient than other state-of-the-art methods in a variety of tasks. + &#x0D;
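The cyclophobic reward is simplest to see in tabular form: returning to an already-visited state is punished, while mere novelty earns nothing. A minimal sketch that omits the paper's hierarchical cropped-observation representations:

```python
class CyclophobicBonus:
    """Intrinsic reward that punishes cycles: revisiting a state costs -1."""
    def __init__(self, penalty=1.0):
        self.penalty = penalty
        self.visited = set()

    def reset(self):               # call at the start of each episode
        self.visited.clear()

    def __call__(self, state_key):
        if state_key in self.visited:
            return -self.penalty   # redundancy is punished...
        self.visited.add(state_key)
        return 0.0                 # ...but mere novelty is not rewarded

bonus = CyclophobicBonus()
trajectory = [(0, 0), (0, 1), (0, 0), (1, 1)]    # toy grid positions
print([bonus(s) for s in trajectory])            # [0.0, 0.0, -1.0, 0.0]
```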
+
+ comment: Published in Transactions on Machine Learning Research (08/2023) +
+
+
+
+
+ + ☆ Thermodynamic Computing via Autonomous Quantum Thermal Machines + + +
+ We develop a physics-based model for classical computation based on autonomous quantum thermal machines. These machines consist of a few interacting quantum bits (qubits) connected to several environments at different temperatures. Heat flows through the machine are here exploited for computing. The process starts by setting the temperatures of the environments according to the logical input. The machine evolves, eventually reaching a non-equilibrium steady state, from which the output of the computation can be determined via the temperature of an auxiliary finite-size reservoir. Such a machine, which we term a "thermodynamic neuron", can implement any linearly-separable function, and we discuss explicitly the cases of NOT, 3-majority and NOR gates. In turn, we show that a network of thermodynamic neurons can perform any desired function. We discuss the close connection between our model and artificial neurons (perceptrons), and argue that our model provides an alternative physics-based analogue implementation of neural networks, and more generally a platform for thermodynamic computing. + &#x0D;
+
+ comment: 12 + 4 pages. Comments welcome! +
+
+
+
+
+ + ☆ Beyond Traditional Neural Networks: Toward adding Reasoning and Learning + Capabilities through Computational Logic Techniques + + +
+ Deep Learning (DL) models have become popular for solving complex problems, +but they have limitations such as the need for high-quality training data, lack +of transparency, and robustness issues. Neuro-Symbolic AI has emerged as a +promising approach combining the strengths of neural networks and symbolic +reasoning. Symbolic knowledge injection (SKI) techniques are a popular method +to incorporate symbolic knowledge into sub-symbolic systems. This work proposes +solutions to improve the knowledge injection process and integrate elements of +ML and logic into multi-agent systems (MAS). + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ On the Potential of CLIP for Compositional Logical Reasoning + + +
+ In this paper we explore the possibility of using OpenAI's CLIP to perform +logically coherent grounded visual reasoning. To that end, we formalize our +terms and give a geometric analysis of how embeddings in CLIP's latent space +would need to be configured in order for the system to be logically coherent. +Our main conclusion is that, as usually configured, CLIP cannot perform such +reasoning. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Towards One-Shot Learning for Text Classification using Inductive Logic + Programming + + +
+ With the ever-increasing potential of AI to perform personalised tasks, it is becoming essential to develop new machine learning techniques which are data-efficient and do not require hundreds or thousands of training examples. In this paper, we explore an Inductive Logic Programming approach for one-shot text classification. In particular, we explore the framework of Meta-Interpretive Learning (MIL), along with using common-sense background knowledge extracted from ConceptNet. Results indicate that MIL can learn text classification rules from a small number of training examples. Moreover, the higher the complexity of the chosen examples, the higher the accuracy of the outcome. + &#x0D;
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ "Would life be more interesting if I were in AI?" Answering + Counterfactuals based on Probabilistic Inductive Logic Programming + + +
+ Probabilistic logic programs are logic programs where some facts hold with a +specified probability. Here, we investigate these programs with a causal +framework that allows counterfactual queries. Learning the program structure +from observational data is usually done through heuristic search relying on +statistical tests. However, these statistical tests lack information about the +causal mechanism generating the data, which makes it unfeasible to use the +resulting programs for counterfactual reasoning. To address this, we propose a +language fragment that allows reconstructing a program from its induced +distribution. This further enables us to learn programs supporting +counterfactual queries. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Minimum Width for Deep, Narrow MLP: A Diffeomorphism and the Whitney + Embedding Theorem Approach + + +
+ Recently, there has been significant attention on determining the minimum width for the universal approximation property of deep, narrow MLPs. Among these problems, approximating a continuous function under the uniform norm is important and challenging, with the gap between its lower and upper bounds being hard to narrow. In this regard, we propose a novel upper bound for the minimum width, given by $\operatorname{max}(2d_x+1, d_y) + \alpha(\sigma)$, to achieve uniform approximation in deep, narrow MLPs, where $0\leq \alpha(\sigma)\leq 2$ is a constant depending on the activation function. We demonstrate this bound through two key proofs. First, we establish that deep, narrow MLPs with little additional width can approximate diffeomorphisms. Second, we utilize the Whitney embedding theorem to show that any continuous function can be approximated by embeddings, further decomposed into linear transformations and diffeomorphisms. + &#x0D;
+
+
+
+
+ + ☆ Domain Generalization without Excess Empirical Risk NeurIPS 2022 + + +
+ Given data from diverse sets of distinct distributions, domain generalization aims to learn models that generalize to unseen distributions. A common approach is designing a data-driven surrogate penalty to capture generalization and minimize the empirical risk jointly with the penalty. We argue that a significant failure mode of this recipe is an excess risk due to an erroneous penalty or hardness in joint optimization. We present an approach that eliminates this problem. Instead of jointly minimizing empirical risk with the penalty, we minimize the penalty under the constraint of optimality of the empirical risk. This change guarantees that the domain generalization penalty cannot impair optimization of the empirical risk, i.e., in-distribution performance. To solve the proposed optimization problem, we demonstrate an exciting connection to rate-distortion theory and utilize its tools to design an efficient method. Our approach can be applied to any penalty-based domain generalization method, and we demonstrate its effectiveness by applying it to three exemplar methods from the literature, showing significant improvements. + &#x0D;
+
+ comment: Published at NeurIPS 2022 +
+
+
+
+
+ + ☆ MSGNN: Multi-scale Spatio-temporal Graph Neural Network for Epidemic + Forecasting + + +
+ Infectious disease forecasting has been a key focus and has proved crucial in controlling epidemics. A recent trend is to develop forecasting models based on graph neural networks (GNNs). However, existing GNN-based methods suffer from two key limitations: (1) current models broaden receptive fields by scaling the depth of GNNs, which is insufficient to preserve the semantics of long-range connectivity between distant but epidemically related areas; (2) previous approaches model epidemics within a single spatial scale, while ignoring the multi-scale epidemic patterns derived from different scales. To address these deficiencies, we devise the Multi-scale Spatio-temporal Graph Neural Network (MSGNN) based on an innovative multi-scale view. To be specific, in the proposed MSGNN model, we first devise a novel graph learning module, which directly captures long-range connectivity from trans-regional epidemic signals and integrates them into a multi-scale graph. Based on the learned multi-scale graph, we utilize a newly designed graph convolution module to exploit multi-scale epidemic patterns. This module allows us to facilitate multi-scale epidemic modeling by mining both scale-shared and scale-specific patterns. Experimental results on forecasting new cases of COVID-19 in the United States demonstrate the superiority of our method over state-of-the-art approaches. Further analyses and visualization also show that MSGNN offers not only accurate, but also robust and interpretable forecasting results. + &#x0D;
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Adaptive Lasso, Transfer Lasso, and Beyond: An Asymptotic Perspective + + +
+ This paper presents a comprehensive exploration of the theoretical properties +inherent in the Adaptive Lasso and the Transfer Lasso. The Adaptive Lasso, a +well-established method, employs regularization divided by initial estimators +and is characterized by asymptotic normality and variable selection +consistency. In contrast, the recently proposed Transfer Lasso employs +regularization subtracted by initial estimators with the demonstrated capacity +to curtail non-asymptotic estimation errors. A pivotal question thus emerges: +Given the distinct ways the Adaptive Lasso and the Transfer Lasso employ +initial estimators, what benefits or drawbacks does this disparity confer upon +each method? This paper conducts a theoretical examination of the asymptotic +properties of the Transfer Lasso, thereby elucidating its differentiation from +the Adaptive Lasso. Informed by the findings of this analysis, we introduce a +novel method, one that amalgamates the strengths and compensates for the +weaknesses of both methods. The paper concludes with validations of our theory +and comparisons of the methods via simulation experiments. + +
+
+
+
+
+ + ☆ Federated Two Stage Decoupling With Adaptive Personalization Layers + + +
+ Federated learning has gained significant attention due to its groundbreaking ability to enable distributed learning while maintaining privacy constraints. However, as a consequence of data heterogeneity among decentralized devices, it inherently experiences significant learning degradation and slow convergence speed. Therefore, it is natural to employ the concept of clustering homogeneous clients into the same group, allowing only the model weights within each group to be aggregated. While most existing clustered federated learning methods employ either model gradients or inference outputs as metrics for client partitioning with the goal of grouping similar devices together, heterogeneity may still exist within each cluster. Moreover, there is a scarcity of research exploring the underlying reasons for determining the appropriate timing for clustering, resulting in the common practice of assigning each client to its own individual cluster, particularly in the context of highly non-independent and identically distributed (non-IID) data. In this paper, we introduce a two-stage decoupling federated learning algorithm with adaptive personalization layers named FedTSDP, where client clustering is performed twice according to inference outputs and model weights, respectively. Hopkins amended sampling is adopted to determine the appropriate timing for clustering and the sampling weight of public unlabeled data. In addition, a simple yet effective approach is developed to adaptively adjust the personalization layers based on varying degrees of data skew. Experimental results show that our proposed method has reliable performance on both IID and non-IID scenarios. + &#x0D;
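For reference, the standard Hopkins statistic (the abstract mentions a "Hopkins amended sampling" variant whose amendment is not specified, so below is the textbook form) measures clusterability: values near 0.5 suggest uniform data, values near 1 suggest clustered structure:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def hopkins(X, m=None, seed=0):
    """Standard Hopkins statistic: ~0.5 for uniform data, -> 1 if clustered."""
    rng = np.random.default_rng(seed)
    n = len(X)
    m = m or max(1, n // 10)
    nn = NearestNeighbors(n_neighbors=2).fit(X)

    idx = rng.choice(n, m, replace=False)
    # distance from sampled real points to their nearest *other* point
    w = nn.kneighbors(X[idx])[0][:, 1]
    # distance from uniform points in the bounding box to the data
    U = rng.uniform(X.min(0), X.max(0), size=(m, X.shape[1]))
    u = nn.kneighbors(U)[0][:, 0]
    return u.sum() / (u.sum() + w.sum())

blobs = np.vstack([np.random.randn(100, 2), np.random.randn(100, 2) + 8])
print(round(hopkins(blobs), 2))   # clearly clustered -> well above 0.5
```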
+
+
+
+
+ + ☆ Peering Through Preferences: Unraveling Feedback Acquisition for + Aligning Large Language Models + + +
+ Aligning large language models (LLMs) with human values and intents critically involves the use of human or AI feedback. While dense feedback annotations are expensive to acquire and integrate, sparse feedback presents a structural design choice between ratings (e.g., score Response A on a scale of 1-7) and rankings (e.g., is Response A better than Response B?). In this work, we analyze the effect of this design choice for the alignment and evaluation of LLMs. We uncover an inconsistency problem wherein the preferences inferred from ratings and rankings significantly disagree 60% of the time for both human and AI annotators. Our subsequent analysis identifies various facets of annotator biases that explain this phenomenon, such as human annotators rating denser responses higher while preferring accuracy during pairwise judgments. To our surprise, we also observe that the choice of feedback protocol has a significant effect on the evaluation of aligned LLMs. In particular, we find that LLMs that leverage rankings data for alignment (say model X) are preferred over those that leverage ratings data (say model Y), with a rank-based evaluation protocol (is X/Y's response better than the reference response?) but not with a rating-based evaluation protocol (score X/Y's response on a scale of 1-7). Our findings thus shed light on critical gaps in methods for evaluating the real-world utility of language models and their strong dependence on the feedback protocol used for alignment. Our code and data are available at https://github.com/Hritikbansal/sparse_feedback. + &#x0D;
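Measuring the rating-versus-ranking inconsistency reduces to a per-pair check: does the preference implied by the two ratings match the directly annotated ranking? A sketch on synthetic annotations, with ties counted as disagreements (one of several possible conventions):

```python
def inferred_pref(rating_a, rating_b):
    """Preference implied by two 1-7 ratings: 'A', 'B', or 'tie'."""
    if rating_a > rating_b:
        return "A"
    if rating_b > rating_a:
        return "B"
    return "tie"

def disagreement_rate(annotations):
    """Fraction of pairs where the rating-implied preference contradicts
    the directly annotated ranking (ties count as disagreements here)."""
    bad = sum(1 for ra, rb, ranked in annotations
              if inferred_pref(ra, rb) != ranked)
    return bad / len(annotations)

# (rating of A, rating of B, direct pairwise ranking) -- synthetic examples
data = [(6, 4, "A"), (5, 5, "A"), (3, 6, "B"), (7, 2, "B")]
print(disagreement_rate(data))   # 0.5 in this toy sample
```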
+
+ comment: 24 pages, 12 Tables, 3 Figures +
+
+
+
+
+ + ☆ HAlf-MAsked Model for Named Entity Sentiment analysis + + +
+ Named Entity Sentiment analysis (NESA) is one of the most actively developing application domains in Natural Language Processing (NLP). Social media NESA is a significant field of opinion analysis since detecting and tracking sentiment trends in the news flow is crucial for building various analytical systems and monitoring the media image of specific people or companies. In this paper, we study different transformer-based solutions for NESA in the RuSentNE-23 evaluation. Despite the effectiveness of BERT-like models, they can still struggle with certain challenges, such as overfitting, which appeared to be the main obstacle to achieving high accuracy on the RuSentNE-23 data. We present several approaches to overcome this problem, among which there is a novel technique of an additional pass over the given data with the entity masked before making the final prediction, so that we can combine logits from the model when it knows the exact entity it predicts sentiment for and when it does not. Utilizing this technique, we ensemble multiple BERT-like models trained on different subsets of data to improve overall performance. Our proposed model achieves the best result on the RuSentNE-23 evaluation data and demonstrates improved consistency in entity-level sentiment analysis. + &#x0D;
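The half-masked trick reduces to running the classifier twice, once with the entity visible and once with it masked, then combining the two logit vectors. A minimal sketch with made-up logits; in practice both passes would come from the same fine-tuned BERT-like model, and the blending weight `alpha` is an assumed free parameter:

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def half_masked_predict(logits_entity, logits_masked, alpha=0.5):
    """Blend logits from the entity-visible and entity-masked passes.

    Averaging the two views regularizes entity-specific overfitting:
    the masked pass supplies context-only evidence.
    """
    combined = alpha * logits_entity + (1 - alpha) * logits_masked
    return softmax(combined)

# Toy 3-class sentiment logits (negative / neutral / positive).
with_entity = np.array([0.2, 0.1, 2.4])   # model sees the entity mention
masked      = np.array([0.3, 1.9, 0.5])   # entity replaced by [MASK]
print(half_masked_predict(with_entity, masked).round(3))
```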
+
+
+
+
+ + ☆ FedCiR: Client-Invariant Representation Learning for Federated Non-IID + Features + + +
+ Federated learning (FL) is a distributed learning paradigm that maximizes the +potential of data-driven models for edge devices without sharing their raw +data. However, devices often have non-independent and identically distributed +(non-IID) data, meaning their local data distributions can vary significantly. +The heterogeneity in input data distributions across devices, commonly referred +to as the feature shift problem, can adversely impact the training convergence +and accuracy of the global model. To analyze the intrinsic causes of the +feature shift problem, we develop a generalization error bound in FL, which +motivates us to propose FedCiR, a client-invariant representation learning +framework that enables clients to extract informative and client-invariant +features. Specifically, we improve the mutual information term between +representations and labels to encourage representations to carry essential +classification knowledge, and diminish the mutual information term between the +client set and representations conditioned on labels to promote representations +of clients to be client-invariant. We further incorporate two regularizers into +the FL framework to bound the mutual information terms with an approximate +global representation distribution to compensate for the absence of the +ground-truth global representation distribution, thus achieving informative and +client-invariant feature extraction. To achieve global representation +distribution approximation, we propose a data-free mechanism performed by the +server without compromising privacy. Extensive experiments demonstrate the +effectiveness of our approach in achieving client-invariant representation +learning and solving the data heterogeneity issue. + +
+
+
+
+
+ + ☆ Split Without a Leak: Reducing Privacy Leakage in Split Learning + + +
+ The popularity of Deep Learning (DL) makes the privacy of sensitive data more +imperative than ever. As a result, various privacy-preserving techniques have +been implemented to preserve user data privacy in DL. Among various +privacy-preserving techniques, collaborative learning techniques, such as Split +Learning (SL) have been utilized to accelerate the learning and prediction +process. Initially, SL was considered a promising approach to data privacy. +However, subsequent research has demonstrated that SL is susceptible to many +types of attacks and, therefore, it cannot serve as a privacy-preserving +technique. Meanwhile, countermeasures using a combination of SL and encryption +have also been introduced to achieve privacy-preserving deep learning. In this +work, we propose a hybrid approach using SL and Homomorphic Encryption (HE). +The idea behind it is that the client encrypts the activation map (the output +of the split layer between the client and the server) before sending it to the +server. Hence, during both forward and backward propagation, the server cannot +reconstruct the client's input data from the intermediate activation map. This +improvement is important as it reduces privacy leakage compared to other +SL-based works, where the server can gain valuable information about the +client's input. In addition, on the MIT-BIH dataset, our proposed hybrid +approach using SL and HE yields faster training time (about 6 times) and +significantly reduced communication overhead (almost 160 times) compared to +other HE-based approaches, thereby offering improved privacy protection for +sensitive data in DL. + +
+
+
+
+
+ + ☆ Efficient and Explainable Graph Neural Architecture Search via + Monte-Carlo Tree Search + + +
+ Graph neural networks (GNNs) are powerful tools for performing data science tasks in various domains. Although GNNs are used in wide application scenarios, it is a laborious task for researchers and practitioners to design/select optimal GNN architectures for diverse graphs. To save human effort and computational cost, graph neural architecture search (Graph NAS) has been used to search for a sub-optimal GNN architecture that combines existing components. However, no existing Graph NAS methods simultaneously satisfy explainability, efficiency, and adaptability to various graphs. Therefore, we propose an efficient and explainable Graph NAS method, called ExGNAS, which consists of (i) a simple search space that can adapt to various graphs and (ii) a search algorithm that makes the decision process explainable. The search space includes only fundamental functions that can handle homophilic and heterophilic graphs. The search algorithm efficiently searches for the best GNN architecture via Monte-Carlo tree search without neural models. The combination of our search space and algorithm enables finding accurate GNN models and the important functions within the search space. We comprehensively evaluate our method against twelve hand-crafted GNN architectures and three Graph NAS methods on four graphs. Our experimental results show that ExGNAS increases AUC by up to 3.6 and reduces run time by up to 78\% compared with the state-of-the-art Graph NAS methods. Furthermore, we show that ExGNAS is effective in analyzing the difference between GNN architectures in homophilic and heterophilic graphs. + &#x0D;
+
+
+
+
+ + ☆ Fully Embedded Time-Series Generative Adversarial Networks + + +
+ Generative Adversarial Networks (GANs) should produce synthetic data that fits the underlying distribution of the data being modeled. For real-valued time-series data, this implies the need to simultaneously capture the static distribution of the data, but also the full temporal distribution of the data for any potential time horizon. This temporal element produces a more complex problem that can potentially leave current solutions under-constrained, unstable during training, or prone to varying degrees of mode collapse. In FETSGAN, entire sequences are translated directly to the generator's sampling space using a seq2seq-style adversarial autoencoder (AAE), where adversarial training is used to match the training distribution in both the feature space and the lower-dimensional sampling space. This additional constraint provides a loose assurance that the temporal distribution of the synthetic samples will not collapse. In addition, the First Above Threshold (FAT) operator is introduced to supplement the reconstruction of encoded sequences, which improves training stability and the overall quality of the synthetic data being generated. These novel contributions demonstrate a significant improvement over the current state of the art for adversarial learners in qualitative measures of temporal similarity and quantitative predictive ability of data generated through FETSGAN. + &#x0D;
+
+
+
+
+ + ☆ Surrogate-based Autotuning for Randomized Sketching Algorithms in + Regression Problems + + +
+ Algorithms from Randomized Numerical Linear Algebra (RandNLA) are known to be effective in handling high-dimensional computational problems, providing high-quality empirical performance as well as strong probabilistic guarantees. However, their practical application is complicated by the fact that the user needs to set various algorithm-specific tuning parameters which are different from those used in traditional NLA. This paper demonstrates how a surrogate-based autotuning approach can be used to address fundamental problems of parameter selection in RandNLA algorithms. In particular, we provide a detailed investigation of surrogate-based autotuning for sketch-and-precondition (SAP) based randomized least squares methods, which have been one of the great success stories in modern RandNLA. Empirical results show that our surrogate-based autotuning approach can achieve near-optimal performance with much less tuning cost than a random search (up to about 4x fewer trials of different parameter configurations). Moreover, while our experiments focus on least squares, our results demonstrate a general-purpose autotuning pipeline applicable to any kind of RandNLA algorithm. + &#x0D;
+
+
+
+
+ + ☆ Exploring Deep Learning for Full-disk Solar Flare Prediction with + Empirical Insights from Guided Grad-CAM Explanations + + +
+ This study progresses solar flare prediction research by presenting a +full-disk deep-learning model to forecast $\geq$M-class solar flares and +evaluating its efficacy on both central (within $\pm$70$^\circ$) and near-limb +(beyond $\pm$70$^\circ$) events, showcasing qualitative assessment of post hoc +explanations for the model's predictions, and providing empirical findings from +human-centered quantitative assessments of these explanations. Our model is +trained using hourly full-disk line-of-sight magnetogram images to predict +$\geq$M-class solar flares within the subsequent 24-hour prediction window. +Additionally, we apply the Guided Gradient-weighted Class Activation Mapping +(Guided Grad-CAM) attribution method to interpret our model's predictions and +evaluate the explanations. Our analysis unveils that full-disk solar flare +predictions correspond with active region characteristics. The following points +represent the most important findings of our study: (1) Our deep learning +models achieved an average true skill statistic (TSS) of $\sim$0.51 and a +Heidke skill score (HSS) of $\sim$0.38, exhibiting skill to predict solar +flares where for central locations the average recall is $\sim$0.75 (recall +values for X- and M-class are 0.95 and 0.73 respectively) and for the near-limb +flares the average recall is $\sim$0.52 (recall values for X- and M-class are +0.74 and 0.50 respectively); (2) qualitative examination of the model's +explanations reveals that it discerns and leverages features linked to active +regions in both central and near-limb locations within full-disk magnetograms +to produce respective predictions. In essence, our models grasp the shape and +texture-based properties of flaring active regions, even in proximity to limb +areas -- a novel and essential capability with considerable significance for +operational forecasting systems. + +
+
+ comment: This is a preprint accepted at the 10th IEEE International Conference + On Data Science And Advanced Analytics (DSAA 2023). The conference + proceedings will be published by the IEEE Xplore Digital Library with ISBN: + 979-8-3503-4503-2. 10 pages, 6 figures +
+
+
+
+
+ + ☆ Speech Wikimedia: A 77 Language Multilingual Speech Dataset ICML + + +
+ The Speech Wikimedia Dataset is a publicly available compilation of audio +with transcriptions extracted from Wikimedia Commons. It includes 1780 hours +(195 GB) of CC-BY-SA licensed transcribed speech from a diverse set of +scenarios and speakers, in 77 different languages. Each audio file has one or +more transcriptions in different languages, making this dataset suitable for +training speech recognition, speech translation, and machine translation +models. + +
+
+ comment: Data-Centric Machine Learning Workshop at the International Machine + Learning Conference 2023 (ICML) +
+
+
+
+
+ + ☆ Threshold KNN-Shapley: A Linear-Time and Privacy-Friendly Approach to + Data Valuation + + +
+ Data valuation, a critical aspect of data-centric ML research, aims to +quantify the usefulness of individual data sources in training machine learning +(ML) models. However, data valuation faces significant yet frequently +overlooked privacy challenges despite its importance. This paper studies these +challenges with a focus on KNN-Shapley, one of the most practical data +valuation methods nowadays. We first emphasize the inherent privacy risks of +KNN-Shapley, and demonstrate the significant technical difficulties in adapting +KNN-Shapley to accommodate differential privacy (DP). To overcome these +challenges, we introduce TKNN-Shapley, a refined variant of KNN-Shapley that is +privacy-friendly, allowing for straightforward modifications to incorporate DP +guarantee (DP-TKNN-Shapley). We show that DP-TKNN-Shapley has several +advantages and offers a superior privacy-utility tradeoff compared to naively +privatized KNN-Shapley in discerning data quality. Moreover, even non-private +TKNN-Shapley achieves comparable performance as KNN-Shapley. Overall, our +findings suggest that TKNN-Shapley is a promising alternative to KNN-Shapley, +particularly for real-world applications involving sensitive data. + +
+
+
+
+
+ + ☆ Towards a Rigorous Analysis of Mutual Information in Contrastive + Learning + + +
+ Contrastive learning has emerged as a cornerstone in recent achievements of +unsupervised representation learning. Its primary paradigm involves an instance +discrimination task with a mutual information loss. The loss is known as +InfoNCE and it has yielded vital insights into contrastive learning through the +lens of mutual information analysis. However, the estimation of mutual +information can prove challenging, creating a gap between the elegance of its +mathematical foundation and the complexity of its estimation. As a result, +drawing rigorous insights or conclusions from mutual information analysis +becomes intricate. In this study, we introduce three novel methods and a few +related theorems, aimed at enhancing the rigor of mutual information analysis. +Despite their simplicity, these methods can carry substantial utility. +Leveraging these approaches, we reassess three instances of contrastive +learning analysis, illustrating their capacity to facilitate deeper +comprehension or to rectify pre-existing misconceptions. Specifically, we +investigate small batch size, mutual information as a measure, and the InfoMin +principle. + +
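For concreteness, the InfoNCE loss at the center of this analysis is a cross-entropy over pairwise similarities whose optimum lower-bounds the mutual information by the log of the batch size; a minimal PyTorch version:

```python
import torch
import torch.nn.functional as F

def info_nce(z1, z2, temperature=0.1):
    """InfoNCE over a batch of positive pairs (z1[i], z2[i]).

    Each z1[i] must pick out its own partner z2[i] among all z2 --
    a cross-entropy whose optimum lower-bounds I(z1; z2) by log N.
    """
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    logits = z1 @ z2.T / temperature          # (N, N) similarity matrix
    labels = torch.arange(len(z1))            # positives on the diagonal
    return F.cross_entropy(logits, labels)

z = torch.randn(256, 128)
aug = z + 0.1 * torch.randn_like(z)           # toy "augmented view"
print(info_nce(z, aug).item())                # small: the views are close
print(torch.log(torch.tensor(256.0)).item())  # the log N bound in play
```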
+
+ comment: 18 pages, 7 figures, Under review +
+
+
+
+
+ + ☆ Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling + Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate + Prediction CIKM 2023 + + +
+ Spatial-temporal information has been proven to be of great significance for click-through rate prediction tasks in online Location-Based Services (LBS), especially in mainstream food ordering platforms such as DoorDash, Uber Eats, Meituan, and Ele.me. Modeling user spatial-temporal preferences with sequential behavior data has become a hot topic in recommendation systems and online advertising. However, most existing methods either lack the representation of rich spatial-temporal information or only handle user behaviors of limited length, e.g. 100. In this paper, we tackle these problems by designing a new spatial-temporal modeling paradigm named Fragment and Integrate Network (FIN). FIN consists of two networks: (i) Fragment Network (FN) extracts Multiple Sub-Sequences (MSS) from lifelong sequential behavior data, and captures the specific spatial-temporal representation by modeling each MSS respectively. Here both a simplified attention and a complicated attention are adopted to balance the performance gain and resource consumption. (ii) Integrate Network (IN) builds a new integrated sequence by utilizing spatial-temporal interaction on MSS and captures the comprehensive spatial-temporal representation by modeling the integrated sequence with a complicated attention. Both public datasets and production datasets have demonstrated the accuracy and scalability of FIN. Since 2022, FIN has been fully deployed in the recommendation advertising system of Ele.me, one of the most popular online food ordering platforms in China, obtaining a 5.7% improvement in Click-Through Rate (CTR) and a 7.3% increase in Revenue Per Mille (RPM). + &#x0D;
+
+ comment: Accepted by CIKM 2023 Applied Research Paper +
+
+
+
+
+ + ☆ Training Towards Critical Use: Learning to Situate AI Predictions + Relative to Human Knowledge + + +
+ A growing body of research has explored how to support humans in making +better use of AI-based decision support, including via training and onboarding. +Existing research has focused on decision-making tasks where it is possible to +evaluate "appropriate reliance" by comparing each decision against a ground +truth label that cleanly maps to both the AI's predictive target and the human +decision-maker's goals. However, this assumption does not hold in many +real-world settings where AI tools are deployed today (e.g., social work, +criminal justice, and healthcare). In this paper, we introduce a +process-oriented notion of appropriate reliance called critical use that +centers the human's ability to situate AI predictions against knowledge that is +uniquely available to them but unavailable to the AI model. To explore how +training can support critical use, we conduct a randomized online experiment in +a complex social decision-making setting: child maltreatment screening. We find +that, by providing participants with accelerated, low-stakes opportunities to +practice AI-assisted decision-making in this setting, novices came to exhibit +patterns of disagreement with AI that resemble those of experienced workers. A +qualitative examination of participants' explanations for their AI-assisted +decisions revealed that they drew upon qualitative case narratives, to which +the AI model did not have access, to learn when (not) to rely on AI +predictions. Our findings open new questions for the study and design of +training for real-world AI-assisted decision-making. + +
+
+
+
+
+ + ☆ Segmenting mechanically heterogeneous domains via unsupervised learning + + +
+ From biological organs to soft robotics, highly deformable materials are essential components of natural and engineered systems. These highly deformable materials can have heterogeneous material properties, and can experience heterogeneous deformations with or without underlying material heterogeneity. Many recent works have established that computational modeling approaches are well suited for understanding and predicting the consequences of material heterogeneity and for interpreting observed heterogeneous strain fields. In particular, there has been significant work towards developing inverse analysis approaches that can convert observed kinematic quantities (e.g., displacement, strain) to material properties and mechanical state. Despite the success of these approaches, they are not necessarily generalizable and often rely on tight control and knowledge of boundary conditions. Here, we build on the recent advances (and ubiquity) of machine learning approaches to explore alternative approaches to detect patterns in heterogeneous material properties and mechanical behavior. Specifically, we explore unsupervised learning approaches to clustering and ensemble clustering to identify heterogeneous regions. Overall, we find that these approaches are effective, yet limited in their abilities. Through this initial exploration (where all data and code are published alongside this manuscript), we set the stage for future studies that more specifically adapt these methods to mechanical data. + &#x0D;
+
+ comment: 26 pages, 10 figures +
+
+
+
+
+ + ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in soybean sprout image analysis. The CongNaMul dataset is curated to facilitate tasks such as image classification, semantic segmentation, decomposition, and measurement of length and weight. The classification task provides four classes to determine the quality of soybean sprouts: normal, broken, spotted, and broken-and-spotted, for the development of AI-aided automatic quality inspection technology. For semantic segmentation, images with varying complexity, from single-sprout images to images with multiple sprouts, along with human-labelled mask images, are included. The labels comprise four classes: background, head, body, and tail. The dataset also provides images and masks for the image decomposition task, including two separate sprout images and their combined form. Lastly, five physical features of sprouts (head length, body length, body thickness, tail length, weight) are provided for image-based measurement tasks. This dataset is expected to be a valuable resource for a wide range of research and applications in the advanced analysis of soybean sprout images. We also hope that this dataset can assist researchers studying classification, semantic segmentation, decomposition, and physical feature measurement in other industrial fields in evaluating their models. The dataset is available at the authors' repository. (https://bhban.kr/data) + &#x0D;
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ☆ MDTD: A Multi Domain Trojan Detector for Deep Neural Networks CCS + + +
+ Machine learning models that use deep neural networks (DNNs) are vulnerable
+to backdoor attacks. An adversary carrying out a backdoor attack embeds a
+predefined perturbation called a trigger into a small subset of input samples
+and trains the DNN such that the presence of the trigger in the input results
+in an adversary-desired output class. Such adversarial retraining, however,
+needs to ensure that outputs for inputs without the trigger remain unaffected
+and provide high classification accuracy on clean samples. In this paper, we
+propose MDTD, a Multi-Domain Trojan Detector for DNNs, which detects inputs
+containing a Trojan trigger at testing time. MDTD does not require knowledge of
+the attacker's trigger-embedding strategy and can be applied to a pre-trained
+DNN model with image, audio, or graph-based inputs. MDTD leverages an insight
+that input samples containing a Trojan trigger are located relatively farther
+away from a decision boundary than clean samples. MDTD estimates the distance
+to a decision boundary using adversarial learning methods and uses this
+distance to infer whether a test-time input sample is Trojaned or not. We
+evaluate MDTD against state-of-the-art Trojan detection methods across five
+widely used image-based datasets: CIFAR100, CIFAR10, GTSRB, SVHN, and
+Flowers102; four graph-based datasets: AIDS, WinMal, Toxicant, and COLLAB; and
+the SpeechCommand audio dataset. MDTD effectively identifies samples that
+contain different types of Trojan triggers. We evaluate MDTD against adaptive
+attacks where an adversary trains a robust DNN to increase (decrease) the
+distance of benign (Trojan) inputs from a decision boundary.
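The detection rule lends itself to a compact sketch: approximate the distance to the decision boundary by the norm of the smallest perturbation an iterative gradient attack needs to flip the prediction, then threshold it. The attack, step size, and calibration below are our illustrative choices, not necessarily the paper's:

```python
# Sketch: score a (single-sample) input by its estimated distance to the
# decision boundary, taken as the L2 norm of the smallest perturbation that
# flips the model's prediction. Model and threshold are hypothetical.
import torch
import torch.nn.functional as F

def boundary_distance(model, x, step=0.01, max_steps=200):
    y0 = model(x).argmax(dim=1)
    x_adv = x.clone().requires_grad_(True)
    for _ in range(max_steps):
        loss = F.cross_entropy(model(x_adv), y0)
        grad, = torch.autograd.grad(loss, x_adv)
        with torch.no_grad():
            x_adv = x_adv + step * grad / (grad.norm() + 1e-12)
        if (model(x_adv).argmax(dim=1) != y0).any():
            break
        x_adv.requires_grad_(True)
    return (x_adv.detach() - x).norm().item()

# Flag an input as Trojaned when it sits unusually far from the boundary;
# the threshold would be calibrated on clean validation samples.
def is_trojaned(model, x, threshold):
    return boundary_distance(model, x) > threshold
```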
+
+ comment: Accepted to ACM Conference on Computer and Communications Security + (ACM CCS) 2023 +
+
+
+
+
+ + ☆ A Unified Analysis for the Subgradient Methods Minimizing Composite + Nonconvex, Nonsmooth and Non-Lipschitz Functions + + +
+ In this paper, we propose a proximal subgradient method (Prox-SubGrad) for
+solving nonconvex and nonsmooth optimization problems without assuming
+Lipschitz continuity conditions. A number of subgradient upper bounds and their
+relationships are presented. By means of these upper bounding conditions, we
+establish some uniform recursive relations for the Moreau envelopes for weakly
+convex optimization. This uniform scheme simplifies and unifies the proof
+schemes to establish the rate of convergence for Prox-SubGrad without assuming
+Lipschitz continuity. We present a novel convergence analysis in this context.
+Furthermore, we propose some new stochastic subgradient upper bounding
+conditions and establish convergence and iteration complexity rates for the
+stochastic subgradient method (Sto-SubGrad) to solve non-Lipschitz and
+nonsmooth stochastic optimization problems. In particular, for both
+deterministic and stochastic subgradient methods on weakly convex optimization
+problems without Lipschitz continuity, under any of the subgradient upper
+bounding conditions to be introduced in the paper, we show that an
+$O(1/\sqrt{T})$ convergence rate holds in terms of the squared gradient norm of
+the Moreau envelope function, which further improves to $O(1/T)$ if, in
+addition, the uniform KL condition with exponent $1/2$ holds.
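For readers unfamiliar with the stationarity measure used here, the Moreau envelope and the stated rates can be written out as follows; this is only the standard form, and the paper's exact constants and smoothing parameter may differ:

```latex
% Moreau envelope of f with smoothing parameter \lambda > 0:
\[
  f_{\lambda}(x) \;=\; \min_{y}\Big\{ f(y) + \tfrac{1}{2\lambda}\,\|y - x\|^{2} \Big\},
\]
% and the reported guarantees, stated in terms of its gradient:
\[
  \min_{t \le T}\ \big\|\nabla f_{\lambda}(x_{t})\big\|^{2} \;=\; O\!\big(1/\sqrt{T}\big),
  \qquad\text{improving to } O(1/T)
  \text{ under the uniform KL condition with exponent } \tfrac{1}{2}.
\]
```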
+
+
+
+
+ + ☆ Emoji Promotes Developer Participation and Issue Resolution on GitHub AAAI + + +
+ Although remote working has been increasingly adopted during the pandemic,
+many are concerned about its low efficiency. Missing in text-based
+communication are non-verbal cues such as facial expressions and body language,
+which hinders effective communication and negatively impacts work outcomes.
+Prevalent on social media platforms, emojis, as alternative non-verbal cues,
+are gaining popularity in virtual workspaces as well. In this paper, we study
+how emoji usage influences developer participation and issue resolution in
+virtual workspaces. To this end, we collect GitHub issues for a one-year period
+and apply causal inference techniques to measure the causal effect of emojis on
+the outcome of issues, controlling for confounders such as issue content,
+repository, and author information. We find that emojis can significantly
+reduce the resolution time of issues and attract more user participation. We
+also compare the heterogeneous effect on different types of issues. These
+findings deepen our understanding of the developer communities, and they
+provide design implications on how to facilitate interactions and broaden
+developer participation.
+
+ comment: 12 pages, 5 figures. To be published in the 18th International AAAI + Conference on Web and Social Media (ICWSM 2024) +
+
+
+
+
+ + ☆ ToddlerBERTa: Exploiting BabyBERTa for Grammar Learning and Language + Understanding + + +
+ We present ToddlerBERTa, a BabyBERTa-like language model, exploring its
+capabilities through five different models with varied hyperparameters.
+Evaluating on BLiMP, SuperGLUE, MSGS, and a Supplement benchmark from the
+BabyLM challenge, we find that smaller models can excel in specific tasks,
+while larger models perform well with substantial data. Despite training on a
+smaller dataset, ToddlerBERTa demonstrates commendable performance, rivalling
+the state-of-the-art RoBERTa-base. The model showcases robust language
+understanding, even with single-sentence pretraining, and competes with
+baselines that leverage broader contextual information. Our work provides
+insights into hyperparameter choices and data utilization, contributing to the
+advancement of language models.
+
+
+
+
+ + ☆ Symmetry Preservation in Hamiltonian Systems: Simulation and Learning + + +
+ This work presents a general geometric framework for simulating and learning
+the dynamics of Hamiltonian systems that are invariant under a Lie group of
+transformations. This means that a group of symmetries is known to act on the
+system respecting its dynamics and, as a consequence of Noether's Theorem,
+conserved quantities are observed. We propose to simulate and learn the
+mappings of interest through the construction of $G$-invariant Lagrangian
+submanifolds, which are pivotal objects in symplectic geometry. A notable
+property of our constructions is that the simulated/learned dynamics also
+preserves the same conserved quantities as the original system, resulting in a
+more faithful surrogate of the original dynamics than non-symmetry aware
+methods, and in a more accurate predictor of non-observed trajectories.
+Furthermore, our setting is able to simulate/learn not only Hamiltonian flows,
+but any Lie group-equivariant symplectic transformation. Our designs leverage
+pivotal techniques and concepts in symplectic geometry and geometric mechanics:
+reduction theory, Noether's Theorem, Lagrangian submanifolds, momentum
+mappings, and coisotropic reduction among others. We also present methods to
+learn Poisson transformations while preserving the underlying geometry and how
+to endow non-geometric integrators with geometric properties. Thus, this work
+presents a novel attempt to harness the power of symplectic and Poisson
+geometry towards simulating and learning problems.
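The conservation statement invoked above can be written compactly in standard symplectic notation; this is our rendering of the classical result, not notation taken from the paper:

```latex
% If a Lie group G acts on (M, \omega) with momentum map J: M -> \mathfrak{g}^*
% and the Hamiltonian H is G-invariant, then for every \xi \in \mathfrak{g}:
\[
  \{\, \langle J, \xi \rangle ,\; H \,\} = 0
  \quad\Longrightarrow\quad
  \frac{d}{dt}\,\langle J(z(t)), \xi \rangle = 0
  \;\;\text{along the Hamiltonian flow } \dot{z} = X_{H}(z).
\]
```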
+
+ comment: 32 pages, 19 figures +
+
+
+
+
+ + ☆ Ten Years of Generative Adversarial Nets (GANs): A survey of the + state-of-the-art + + +
+ Since their inception in 2014, Generative Adversarial Networks (GANs) have
+rapidly emerged as powerful tools for generating realistic and diverse data
+across various domains, including computer vision and other applied areas.
+Consisting of a discriminative network and a generative network engaged in a
+Minimax game, GANs have revolutionized the field of generative modeling. In
+February 2018, GANs secured the leading spot on the ``Top Ten Global
+Breakthrough Technologies'' list issued by the MIT Technology Review. Over the
+years, numerous advancements have been proposed, leading to a rich array of GAN
+variants, such as conditional GAN, Wasserstein GAN, CycleGAN, and StyleGAN,
+among many others. This survey aims to provide a general overview of GANs,
+summarizing the latent architecture, validation metrics, and application areas
+of the most widely recognized variants. We also delve into recent theoretical
+developments, exploring the profound connection between the adversarial
+principle underlying GAN and Jensen-Shannon divergence, while discussing the
+optimality characteristics of the GAN framework. The efficiency of GAN variants
+and their model architectures will be evaluated along with training obstacles
+as well as training solutions. In addition, a detailed discussion will be
+provided, examining the integration of GANs with newly developed deep learning
+frameworks such as Transformers, Physics-Informed Neural Networks, Large
+Language Models, and Diffusion models. Finally, we outline several open issues
+as well as future research directions in this field.
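For readers who want the Minimax game spelled out, a minimal sketch of the original (non-saturating) GAN training loop follows; the network sizes, stand-in data, and hyperparameters are purely illustrative:

```python
# Minimal GAN: V(D, G) = E[log D(x)] + E[log(1 - D(G(z)))], trained by
# alternating updates of the discriminator D and the generator G.
import torch
import torch.nn as nn

G = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 2))
D = nn.Sequential(nn.Linear(2, 64), nn.ReLU(), nn.Linear(64, 1), nn.Sigmoid())
opt_g = torch.optim.Adam(G.parameters(), lr=2e-4)
opt_d = torch.optim.Adam(D.parameters(), lr=2e-4)
bce = nn.BCELoss()

for step in range(1000):
    real = torch.randn(128, 2) * 0.5 + 2.0        # stand-in "real" data
    fake = G(torch.randn(128, 16))

    # Discriminator: push D(real) -> 1 and D(fake) -> 0.
    d_loss = bce(D(real), torch.ones(128, 1)) + \
             bce(D(fake.detach()), torch.zeros(128, 1))
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()

    # Generator: non-saturating loss, push D(G(z)) -> 1.
    g_loss = bce(D(fake), torch.ones(128, 1))
    opt_g.zero_grad(); g_loss.backward(); opt_g.step()
```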
+
+
+
+
+ + ☆ Classification of Anomalies in Telecommunication Network KPI Time Series + + +
+ The increasing complexity and scale of telecommunication networks have led to
+a growing interest in automated anomaly detection systems. However, the
+classification of anomalies detected on network Key Performance Indicators
+(KPI) has received less attention, resulting in a lack of information about
+anomaly characteristics and classification processes. To address this gap, this
+paper proposes a modular anomaly classification framework. The framework
+assumes separate entities for the anomaly classifier and the detector, allowing
+for a distinct treatment of anomaly detection and classification tasks on time
+series. The objectives of this study are (1) to develop a time series simulator
+that generates synthetic time series resembling real-world network KPI
+behavior, (2) to build a detection model to identify anomalies in the time
+series, (3) to build classification models that accurately categorize detected
+anomalies into predefined classes, and (4) to evaluate the performance of the
+classification framework on simulated and real-world network KPI time series.
+This study has demonstrated the good performance of the anomaly classification
+models trained on simulated anomalies when applied to real-world network time
+series data.
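Objective (1) is easy to picture with a toy version: a simulator producing a KPI-like series with labelled, injected anomalies. The anomaly types and parameters below are our illustration, not the paper's simulator:

```python
# Toy KPI simulator: daily seasonality plus noise, with labelled anomalies.
import numpy as np

def simulate_kpi(n=1440, seed=0):
    rng = np.random.default_rng(seed)
    t = np.arange(n)
    series = 10 + 3 * np.sin(2 * np.pi * t / 288) + rng.normal(0, 0.3, n)
    labels = np.zeros(n, dtype=int)                # 0 = normal

    i = int(rng.integers(100, n - 50))
    series[i] += 8;  labels[i] = 1                 # 1 = spike

    j = int(rng.integers(100, n - 50))
    series[j:j + 30] -= 5;  labels[j:j + 30] = 2   # 2 = level shift
    return series, labels

series, labels = simulate_kpi()
print("anomalous points:", int((labels > 0).sum()))
```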
+
+
+
+
+ + ☆ Learning Diverse Features in Vision Transformers for Improved + Generalization ICML + + +
+ Deep learning models often rely only on a small set of features even when +there is a rich set of predictive signals in the training data. This makes +models brittle and sensitive to distribution shifts. In this work, we first +examine vision transformers (ViTs) and find that they tend to extract robust +and spurious features with distinct attention heads. As a result of this +modularity, their performance under distribution shifts can be significantly +improved at test time by pruning heads corresponding to spurious features, +which we demonstrate using an "oracle selection" on validation data. Second, we +propose a method to further enhance the diversity and complementarity of the +learned features by encouraging orthogonality of the attention heads' input +gradients. We observe improved out-of-distribution performance on diagnostic +benchmarks (MNIST-CIFAR, Waterbirds) as a consequence of the enhanced diversity +of features and the pruning of undesirable heads. + +
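The proposed regularizer can be sketched as follows: penalize the squared cosine similarity between input gradients taken per attention head, so that heads are pushed toward orthogonal (diverse) input sensitivities. The exact formulation in the paper may differ; `head_outputs` is a hypothetical hook returning one scalar per head:

```python
# Sketch: diversity regularizer penalizing alignment between the input
# gradients of per-head scalar outputs (e.g., pooled output norms).
import torch

def head_gradient_orthogonality(head_outputs, x):
    x = x.requires_grad_(True)
    scores = head_outputs(x)                      # shape: (num_heads,)
    grads = [
        torch.autograd.grad(s, x, create_graph=True, retain_graph=True)[0].flatten()
        for s in scores
    ]
    penalty = x.new_zeros(())
    for i in range(len(grads)):
        for j in range(i + 1, len(grads)):
            cos = torch.dot(grads[i], grads[j]) / (
                grads[i].norm() * grads[j].norm() + 1e-12
            )
            penalty = penalty + cos ** 2          # zero when gradients are orthogonal
    return penalty                                # add, weighted, to the task loss
```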
+
+ comment: 2023 ICML Workshop on Spurious Correlations, Invariance and Stability +
+
+
+
+
+ + ☆ A numerical approach for the fractional Laplacian via deep neural + networks + + +
+ We consider the fractional elliptic problem with Dirichlet boundary
+conditions on a bounded and convex domain $D$ of $\mathbb{R}^d$, with $d \geq
+2$. In this paper, we implement a stochastic gradient descent algorithm that
+approximates the solution of the fractional problem via deep neural networks.
+Additionally, we provide four numerical examples to test the efficiency of the
+algorithm; each example is studied for many values of $\alpha \in (1,2)$ and
+$d \geq 2$.
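For reference, the operator in question is commonly defined through the principal-value singular integral; this is the standard definition, not copied from the paper:

```latex
% Fractional Laplacian of order \alpha \in (1,2) on R^d:
\[
  (-\Delta)^{\alpha/2} u(x)
  \;=\; c_{d,\alpha}\;\mathrm{P.V.}\!\int_{\mathbb{R}^d}
        \frac{u(x) - u(y)}{|x - y|^{\,d+\alpha}}\, dy,
  \qquad
  c_{d,\alpha} \;=\; \frac{2^{\alpha}\,\Gamma\!\big(\tfrac{d+\alpha}{2}\big)}
                          {\pi^{d/2}\,\big|\Gamma\!\big(-\tfrac{\alpha}{2}\big)\big|}.
\]
```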
+
+ comment: 32 pages, 21 figures, 3 tables +
+
+
+
+
+ + ☆ Emergence of Segmentation with Minimalistic White-Box Transformers + + +
+ Transformer-like models for vision tasks have recently proven effective for a +wide range of downstream applications such as segmentation and detection. +Previous works have shown that segmentation properties emerge in vision +transformers (ViTs) trained using self-supervised methods such as DINO, but not +in those trained on supervised classification tasks. In this study, we probe +whether segmentation emerges in transformer-based models solely as a result of +intricate self-supervised learning mechanisms, or if the same emergence can be +achieved under much broader conditions through proper design of the model +architecture. Through extensive experimental results, we demonstrate that when +employing a white-box transformer-like architecture known as CRATE, whose +design explicitly models and pursues low-dimensional structures in the data +distribution, segmentation properties, at both the whole and parts levels, +already emerge with a minimalistic supervised training recipe. Layer-wise +finer-grained analysis reveals that the emergent properties strongly +corroborate the designed mathematical functions of the white-box network. Our +results suggest a path to design white-box foundation models that are +simultaneously highly performant and mathematically fully interpretable. Code +is at \url{https://github.com/Ma-Lab-Berkeley/CRATE}. + +
+
+ comment: Code: https://github.com/Ma-Lab-Berkeley/CRATE +
+
+
+
+
+ + ☆ Materials Informatics Transformer: A Language Model for Interpretable + Materials Properties Prediction + + +
+ Recently, the remarkable capabilities of large language models (LLMs) have
+been illustrated across a variety of research domains such as natural language
+processing, computer vision, and molecular modeling. We extend this paradigm to
+material property prediction by introducing our model, the Materials
+Informatics Transformer (MatInFormer). Specifically, we introduce a novel
+approach that involves learning the grammar of crystallography through the
+tokenization of pertinent space group information. We further illustrate the
+adaptability of MatInFormer by incorporating task-specific data pertaining to
+Metal-Organic Frameworks (MOFs). Through attention visualization, we uncover
+the key features that the model prioritizes during property prediction. The
+effectiveness of our proposed model is empirically validated across 14 distinct
+datasets, thereby underscoring its potential for high throughput screening
+through accurate material property prediction.
+
+
+
+
+ + ♻ ☆ Policy composition in reinforcement learning via multi-objective policy + optimization + + +
+ We enable reinforcement learning agents to learn successful behavior policies
+by utilizing relevant pre-existing teacher policies. The teacher policies are
+introduced as objectives, in addition to the task objective, in a
+multi-objective policy optimization setting. Using the Multi-Objective Maximum
+a Posteriori Policy Optimization algorithm (Abdolmaleki et al. 2020), we show
+that teacher policies can help speed up learning, particularly in the absence
+of shaping rewards. In two domains with continuous observation and action
+spaces, our agents successfully compose teacher policies in sequence and in
+parallel, and are also able to further extend the policies of the teachers in
+order to solve the task.
+ Depending on the specified combination of task and teacher(s), teacher(s) may
+naturally act to limit the final performance of an agent. The extent to which
+agents are required to adhere to teacher policies is determined by
+hyperparameters that determine both the effect of teachers on learning speed
+and the eventual performance of the agent on the task. In the humanoid domain
+(Tassa et al. 2018), we also equip agents with the ability to control the
+selection of teachers. With this ability, agents are able to meaningfully
+compose from the teacher policies to achieve a higher task reward on the walk
+task than in cases without access to the teacher policies. We show the
+resemblance of composed task policies with the corresponding teacher policies
+through videos.
+
+
+
+
+ + ♻ ☆ Walking in the Shadow: A New Perspective on Descent Directions for + Constrained Minimization + + +
+ Descent directions, including movement towards Frank-Wolfe vertices,
+away-steps, in-face away-steps and pairwise directions, have been an important
+design consideration in conditional gradient descent (CGD) variants. In this
+work, we attempt to demystify the impact of the movement in these directions
+towards attaining constrained minimizers. The optimal local direction of
+descent is the directional derivative (i.e., shadow) of the projection of the
+negative gradient. We show that this direction is the best away-step possible,
+and the continuous-time dynamics of moving in the shadow is equivalent to the
+dynamics of projected gradient descent (PGD), although it's non-trivial to
+discretize. We also show that Frank-Wolfe (FW) vertices correspond to
+projecting onto the polytope using an "infinite" step in the direction of the
+negative gradient, thus providing a new perspective on these steps. We combine
+these insights into a novel Shadow-CG method that uses FW and shadow steps,
+while enjoying linear convergence, with a rate that depends on the number of
+breakpoints in its projection curve, rather than the pyramidal width. We
+provide a linear bound on the number of breakpoints for simple polytopes and
+present scaling-invariant upper bounds for general polytopes based on the
+number of facets. We exemplify the benefit of using Shadow-CG computationally
+for various applications, while raising an open question about tightening the
+bound on the number of breakpoints for general polytopes.
+
+
+
+
+ + ♻ ☆ CartiMorph: a framework for automated knee articular cartilage + morphometrics + + +
+ We introduce CartiMorph, a framework for automated knee articular cartilage +morphometrics. It takes an image as input and generates quantitative metrics +for cartilage subregions, including the percentage of full-thickness cartilage +loss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the +power of deep learning models for hierarchical image feature representation. +Deep learning models were trained and validated for tissue segmentation, +template construction, and template-to-image registration. We established +methods for surface-normal-based cartilage thickness mapping, FCL estimation, +and rule-based cartilage parcellation. Our cartilage thickness map showed less +error in thin and peripheral regions. We evaluated the effectiveness of the +adopted segmentation model by comparing the quantitative metrics obtained from +model segmentation and those from manual segmentation. The root-mean-squared +deviation of the FCL measurements was less than 8%, and strong correlations +were observed for the mean thickness (Pearson's correlation coefficient $\rho +\in [0.82,0.97]$), surface area ($\rho \in [0.82,0.98]$) and volume ($\rho \in +[0.89,0.98]$) measurements. We compared our FCL measurements with those from a +previous study and found that our measurements deviated less from the ground +truths. We observed superior performance of the proposed rule-based cartilage +parcellation method compared with the atlas-based approach. CartiMorph has the +potential to promote imaging biomarkers discovery for knee osteoarthritis. + +
+
+ comment: To be published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Distributionally Robust Statistical Verification with Imprecise Neural + Networks + + +
+ A particularly challenging problem in AI safety is providing guarantees on
+the behavior of high-dimensional autonomous systems. Verification approaches
+centered around reachability analysis fail to scale, and purely statistical
+approaches are constrained by the distributional assumptions about the sampling
+process. Instead, we pose a distributionally robust version of the statistical
+verification problem for black-box systems, where our performance guarantees
+hold over a large family of distributions. This paper proposes a novel approach
+based on a combination of active learning, uncertainty quantification, and
+neural network verification. A central piece of our approach is an ensemble
+technique called Imprecise Neural Networks, which provides the uncertainty to
+guide active learning. The active learning uses the exhaustive neural-network
+verification tool Sherlock to collect samples. An evaluation on multiple
+physical simulators in the OpenAI Gym MuJoCo environments with
+reinforcement-learned controllers demonstrates that our approach can provide
+useful and scalable guarantees for high-dimensional systems.
+
+
+
+
+ + ♻ ☆ On progressive sharpening, flat minima and generalisation + + +
+ We present a new approach to understanding the relationship between loss +curvature and input-output model behaviour in deep learning. Specifically, we +use existing empirical analyses of the spectrum of deep network loss Hessians +to ground an ansatz tying together the loss Hessian and the input-output +Jacobian of a deep neural network over training samples throughout training. We +then prove a series of theoretical results which quantify the degree to which +the input-output Jacobian of a model approximates its Lipschitz norm over a +data distribution, and deduce a novel generalisation bound in terms of the +empirical Jacobian. We use our ansatz, together with our theoretical results, +to give a new account of the recently observed progressive sharpening +phenomenon, as well as the generalisation properties of flat minima. +Experimental evidence is provided to validate our claims. + +
+
+
+
+
+ + ♻ ☆ What You Hear Is What You See: Audio Quality Metrics From Image Quality + Metrics + + +
+ In this study, we investigate the feasibility of utilizing state-of-the-art +image perceptual metrics for evaluating audio signals by representing them as +spectrograms. The encouraging outcome of the proposed approach is based on the +similarity between the neural mechanisms in the auditory and visual pathways. +Furthermore, we customise one of the metrics which has a psychoacoustically +plausible architecture to account for the peculiarities of sound signals. We +evaluate the effectiveness of our proposed metric and several baseline metrics +using a music dataset, with promising results in terms of the correlation +between the metrics and the perceived quality of audio as rated by human +evaluators. + +
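The core idea reduces to a few lines, assuming a reference and a degraded waveform: render both as log-magnitude spectrograms and compare them with an off-the-shelf image metric such as SSIM (our choice of metric for illustration; the paper evaluates several):

```python
# Sketch: score a degraded audio signal against a reference by comparing
# log-magnitude spectrograms with SSIM, an image quality metric.
import numpy as np
from scipy.signal import spectrogram
from skimage.metrics import structural_similarity as ssim

fs = 16000
t = np.arange(fs) / fs
reference = np.sin(2 * np.pi * 440 * t)                     # 1 s of a 440 Hz tone
degraded = reference + 0.05 * np.random.default_rng(0).standard_normal(fs)

def log_spec(x):
    _, _, s = spectrogram(x, fs=fs, nperseg=512)
    return np.log1p(s)

a, b = log_spec(reference), log_spec(degraded)
print("spectrogram SSIM:", ssim(a, b, data_range=float(max(a.max(), b.max()))))
```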
+
+
+
+
+ + ♻ ☆ Cancellation-Free Regret Bounds for Lagrangian Approaches in Constrained + Markov Decision Processes + + +
+ Constrained Markov Decision Processes (CMDPs) are one of the common ways to +model safe reinforcement learning problems, where constraint functions model +the safety objectives. Lagrangian-based dual or primal-dual algorithms provide +efficient methods for learning in CMDPs. For these algorithms, the currently +known regret bounds in the finite-horizon setting allow for a "cancellation of +errors"; one can compensate for a constraint violation in one episode with a +strict constraint satisfaction in another. However, we do not consider such a +behavior safe in practical applications. In this paper, we overcome this +weakness by proposing a novel model-based dual algorithm OptAug-CMDP for +tabular finite-horizon CMDPs. Our algorithm is motivated by the augmented +Lagrangian method and can be performed efficiently. We show that during $K$ +episodes of exploring the CMDP, our algorithm obtains a regret of +$\tilde{O}(\sqrt{K})$ for both the objective and the constraint violation. +Unlike existing Lagrangian approaches, our algorithm achieves this regret +without the need for the cancellation of errors. + +
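The distinction drawn here can be made precise. With cancellation, constraint regret is a signed sum in which violations and slack offset each other; the cancellation-free notion accumulates only positive parts (notation ours):

```latex
% Signed (cancellation allowed) vs. cancellation-free constraint regret
% over K episodes, for constraint cost c and threshold \bar{c}:
\[
  \mathrm{Reg}^{\mathrm{signed}}_{c}(K) \;=\; \sum_{k=1}^{K}\big(c(\pi_{k}) - \bar{c}\big),
  \qquad
  \mathrm{Reg}^{+}_{c}(K) \;=\; \sum_{k=1}^{K}\big[c(\pi_{k}) - \bar{c}\big]_{+},
\]
% the paper bounds the latter, stronger notion by \tilde{O}(\sqrt{K}).
```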
+
+
+
+
+ + ♻ ☆ On the Consistency of Average Embeddings for Item Recommendation RecSys 2023 + + +
+ A prevalent practice in recommender systems consists in averaging item +embeddings to represent users or higher-level concepts in the same embedding +space. This paper investigates the relevance of such a practice. For this +purpose, we propose an expected precision score, designed to measure the +consistency of an average embedding relative to the items used for its +construction. We subsequently analyze the mathematical expression of this score +in a theoretical setting with specific assumptions, as well as its empirical +behavior on real-world data from music streaming services. Our results +emphasize that real-world averages are less consistent for recommendation, +which paves the way for future research to better align real-world embeddings +with assumptions from our theoretical setting. + +
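The consistency question can be prototyped in a few lines: average a set of item embeddings and check how many of the average's nearest neighbours are the items it was built from. This is a simplified stand-in for the paper's expected precision score, with random embeddings in place of real ones:

```python
# Toy consistency check for an average embedding: fraction of the average's
# top-k neighbours that belong to the items it was built from.
import numpy as np

rng = np.random.default_rng(0)
catalog = rng.standard_normal((1000, 32))
catalog /= np.linalg.norm(catalog, axis=1, keepdims=True)

items = rng.choice(1000, size=10, replace=False)   # e.g., a user's history
avg = catalog[items].mean(axis=0)
avg /= np.linalg.norm(avg)

scores = catalog @ avg                              # cosine similarities
topk = np.argsort(-scores)[:10]
print("precision@10 of the average:", np.isin(topk, items).mean())
```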
+
+ comment: 17th ACM Conference on Recommender Systems (RecSys 2023) +
+
+
+
+
+ + ♻ ☆ Rule Generation for Classification: Scalability, Interpretability, and + Fairness + + +
+ We introduce a new rule-based optimization method for classification with
+constraints. The proposed method leverages column generation for linear
+programming, and hence, is scalable to large datasets. The resulting pricing
+subproblem is shown to be NP-Hard. We resort to a decision tree-based
+heuristic and solve a proxy pricing subproblem for acceleration. The method
+returns a set of rules along with their optimal weights indicating the
+importance of each rule for learning. We address interpretability and fairness
+by assigning cost coefficients to the rules and introducing additional
+constraints. In particular, we focus on local interpretability and generalize
+the separation criterion in fairness to multiple sensitive attributes and
+classes. We test the performance of the proposed methodology on a collection of
+datasets and present a case study to elaborate on its different aspects. The
+proposed rule-based learning method exhibits a good compromise between local
+interpretability and fairness on the one side, and accuracy on the other side.
+
+
+
+
+ + ♻ ☆ Tensor train completion: local recovery guarantees via Riemannian + optimization + + +
+ In this work, we estimate the number of randomly selected elements of a +tensor that with high probability guarantees local convergence of Riemannian +gradient descent for tensor train completion. We derive a new bound for the +orthogonal projections onto the tangent spaces based on the harmonic mean of +the unfoldings' singular values and introduce a notion of core coherence for +tensor trains. We also extend the results to tensor train completion with +auxiliary subspace information and obtain the corresponding local convergence +guarantees. + +
+
+ comment: 1 figure added; Accepted version +
+
+
+
+
+ + ♻ ☆ Quantized Low-Rank Multivariate Regression with Random Dithering + + +
+ Low-rank multivariate regression (LRMR) is an important statistical learning
+model that combines highly correlated tasks as a multiresponse regression
+problem with a low-rank prior on the coefficient matrix. In this paper, we
+study quantized LRMR, a practical setting where the responses and/or the
+covariates are discretized to finite precision. We focus on the estimation of
+the underlying coefficient matrix. To make possible a consistent estimator that
+can achieve arbitrarily small error, we employ uniform quantization with random
+dithering, i.e., we add appropriate random noise to the data before
+quantization. Specifically, uniform dither and triangular dither are used for
+responses and covariates, respectively. Based on the quantized data, we propose
+the constrained Lasso and regularized Lasso estimators, and derive the
+non-asymptotic error bounds. With the aid of dithering, the estimators achieve
+the minimax optimal rate, while quantization only slightly worsens the
+multiplicative factor in the error rate. Moreover, we extend our results to a
+low-rank regression model with matrix responses. We corroborate and demonstrate
+our theoretical results via simulations on synthetic data and image
+restoration.
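Concretely, the dithered uniform quantization described above can be rendered as follows; the resolution `delta` and the data are our illustrative choices, not the paper's experimental setup:

```python
# Uniform quantizer with random dithering: uniform dither for responses,
# triangular dither (sum of two uniforms) for covariates.
import numpy as np

rng = np.random.default_rng(0)
delta = 0.5                                        # quantization resolution

def quantize(x, dither):
    return delta * np.floor((x + dither) / delta) + delta / 2

y = rng.standard_normal(10000)                     # responses
tau_u = rng.uniform(-delta / 2, delta / 2, y.shape)
y_q = quantize(y, tau_u)

X = rng.standard_normal((10000, 5))                # covariates
tau_t = rng.uniform(-delta / 2, delta / 2, X.shape) + \
        rng.uniform(-delta / 2, delta / 2, X.shape)
X_q = quantize(X, tau_t)

# Uniform dithering makes the quantization error zero-mean:
print("mean response error:", float(np.mean(y_q - y)))   # approximately 0
```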
+
+ comment: 16 pages (Submitted) +
+
+
+
+
+ + ♻ ☆ MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with + Transformers + + +
+ Message Passing Interface (MPI) plays a crucial role in distributed memory
+parallelization across multiple nodes. However, parallelizing MPI code
+manually, and specifically, performing domain decomposition, is a challenging,
+error-prone task. In this paper, we address this problem by developing
+MPI-RICAL, a novel data-driven, programming-assistance tool that assists
+programmers in writing domain decomposition based distributed memory
+parallelization code. Specifically, we train a supervised language model to
+suggest MPI functions and their proper locations in the code on the fly. We
+also introduce MPICodeCorpus, the first publicly available corpus of MPI-based
+parallel programs that is created by mining more than 15,000 open-source
+repositories on GitHub. Experiments were conducted on MPICodeCorpus and, more
+importantly, on a compiled benchmark of MPI-based parallel programs for
+numerical computations that represent real-world scientific applications.
+MPI-RICAL achieves F1 scores between 0.87-0.91 on these programs, demonstrating
+its accuracy in suggesting correct MPI functions at appropriate code
+locations. The source code used in this work, as well as other relevant
+sources, are available at:
+https://github.com/Scientific-Computing-Lab-NRCN/MPI-rical
+
+
+
+
+ + ♻ ☆ Modeling Moral Choices in Social Dilemmas with Multi-Agent Reinforcement + Learning IJCAI 2023 + + +
+ Practical uses of Artificial Intelligence (AI) in the real world have +demonstrated the importance of embedding moral choices into intelligent agents. +They have also highlighted that defining top-down ethical constraints on AI +according to any one type of morality is extremely challenging and can pose +risks. A bottom-up learning approach may be more appropriate for studying and +developing ethical behavior in AI agents. In particular, we believe that an +interesting and insightful starting point is the analysis of emergent behavior +of Reinforcement Learning (RL) agents that act according to a predefined set of +moral rewards in social dilemmas. + In this work, we present a systematic analysis of the choices made by +intrinsically-motivated RL agents whose rewards are based on moral theories. We +aim to design reward structures that are simplified yet representative of a set +of key ethical systems. Therefore, we first define moral reward functions that +distinguish between consequence- and norm-based agents, between morality based +on societal norms or internal virtues, and between single- and mixed-virtue +(e.g., multi-objective) methodologies. Then, we evaluate our approach by +modeling repeated dyadic interactions between learning moral agents in three +iterated social dilemma games (Prisoner's Dilemma, Volunteer's Dilemma and Stag +Hunt). We analyze the impact of different types of morality on the emergence of +cooperation, defection or exploitation, and the corresponding social outcomes. +Finally, we discuss the implications of these findings for the development of +moral agents in artificial and mixed human-AI societies. + +
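To make the reward design concrete, here is one way consequence-based, norm-based, and virtue-based rewards could differ in the Prisoner's Dilemma; these functions are our illustration of the general idea, not the paper's exact definitions:

```python
# Illustrative moral reward functions for the Prisoner's Dilemma.
# Actions: 0 = cooperate, 1 = defect. Standard payoff matrix.
PAYOFF = {(0, 0): (3, 3), (0, 1): (0, 5), (1, 0): (5, 0), (1, 1): (1, 1)}

def utilitarian_reward(my_action, other_action):
    """Consequence-based: value the sum of everyone's payoff."""
    mine, theirs = PAYOFF[(my_action, other_action)]
    return mine + theirs

def deontological_reward(my_action, other_action):
    """Norm-based: penalize violating the norm 'do not defect on a cooperator'."""
    return -5.0 if (my_action == 1 and other_action == 0) else 0.0

def virtue_reward(my_action, other_action):
    """Internal-virtue flavour: intrinsic value for cooperating, payoff aside."""
    return 1.0 if my_action == 0 else 0.0
```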
+
+ comment: Accepted at IJCAI 2023 (32nd International Joint Conference on + Artificial Intelligence - Macao, S.A.R.) +
+
+
+
+
+ + ♻ ☆ NeXtQSM -- A complete deep learning pipeline for data-consistent + quantitative susceptibility mapping trained with hybrid data + + +
+ Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great
+potential in recent years, obtaining similar results to established
+non-learning approaches. Many current deep learning approaches are not data
+consistent, require in vivo training data, or solve the QSM problem in
+consecutive steps, resulting in the propagation of errors. Here, we aim to
+overcome these limitations and develop a framework to solve the QSM
+processing steps jointly. We developed a new hybrid training data generation
+method that enables the end-to-end training for solving background field
+correction and dipole inversion in a data-consistent fashion using a
+variational network that combines the QSM model term and a learned regularizer.
+We demonstrate that NeXtQSM overcomes the limitations of previous deep learning
+methods. NeXtQSM offers a new deep learning based pipeline for computing
+quantitative susceptibility maps that integrates each processing step into the
+training and provides results that are robust and fast.
+
+
+
+
+ + ♻ ☆ An exponentially-growing family of universal quantum circuits + + +
+ Quantum machine learning has become an area of growing interest but has
+certain theoretical and hardware-specific limitations. Notably, the problem of
+vanishing gradients, or barren plateaus, renders the training impossible for
+circuits with high qubit counts, imposing a limit on the number of qubits that
+data scientists can use for solving problems. Independently, angle-embedded
+supervised quantum neural networks were shown to produce truncated Fourier
+series with a degree directly dependent on two factors: the depth of the
+encoding and the number of parallel qubits the encoding is applied to. The
+degree of the Fourier series limits the model expressivity. This work
+introduces two new architectures whose Fourier degrees grow exponentially: the
+sequential and parallel exponential quantum machine learning architectures.
+This is done by efficiently using the available Hilbert space when encoding,
+increasing the expressivity of the quantum encoding. Therefore, the exponential
+growth allows staying at the low-qubit limit to create highly expressive
+circuits avoiding barren plateaus. Practically, the parallel exponential
+architecture was shown to outperform the existing linear architectures by
+reducing their final mean square error value by up to 44.7% in a
+one-dimensional test problem. Furthermore, the feasibility of this technique
+was also shown on a trapped ion quantum processing unit.
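The expressivity claim rests on the known result that angle-embedded models realize truncated Fourier series; schematically (our notation, with the degree $L$ set by encoding depth and width):

```latex
% Output of an angle-embedded quantum model as a truncated Fourier series:
\[
  f_{\theta}(x) \;=\; \sum_{\omega \in \Omega} c_{\omega}(\theta)\, e^{\,i \omega x},
  \qquad \Omega = \{-L, \dots, 0, \dots, L\}.
\]
% Linear architectures grow the degree L linearly in encoding depth and qubit
% count; the proposed architectures grow L exponentially.
```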
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Exploring the Benefits of Visual Prompting in Differential Privacy ICCV 2023 + + +
+ Visual Prompting (VP) is an emerging and powerful technique that allows +sample-efficient adaptation to downstream tasks by engineering a well-trained +frozen source model. In this work, we explore the benefits of VP in +constructing compelling neural network classifiers with differential privacy +(DP). We explore and integrate VP into canonical DP training methods and +demonstrate its simplicity and efficiency. In particular, we discover that VP +in tandem with PATE, a state-of-the-art DP training method that leverages the +knowledge transfer from an ensemble of teachers, achieves the state-of-the-art +privacy-utility trade-off with minimum expenditure of privacy budget. Moreover, +we conduct additional experiments on cross-domain image classification with a +sufficient domain gap to further unveil the advantage of VP in DP. Lastly, we +also conduct extensive ablation studies to validate the effectiveness and +contribution of VP under DP consideration. Our code is available at +(https://github.com/EzzzLi/Prompt-PATE). + +
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DeltaNN: Assessing the Impact of Computational Environment Parameters on + the Performance of Image Recognition Models + + +
+ Image recognition tasks typically use deep learning and require enormous +processing power, thus relying on hardware accelerators like GPUs and TPUs for +fast, timely processing. Failure in real-time image recognition tasks can occur +due to sub-optimal mapping on hardware accelerators during model deployment, +which may lead to timing uncertainty and erroneous behavior. Mapping on +hardware accelerators is done using multiple software components like deep +learning frameworks, compilers, and device libraries, that we refer to as the +computational environment. Owing to the increased use of image recognition +tasks in safety-critical applications like autonomous driving and medical +imaging, it is imperative to assess their robustness to changes in the +computational environment, as the impact of parameters like deep learning +frameworks, compiler optimizations, and hardware devices on model performance +and correctness is not yet well understood. + In this paper we present a differential testing framework, DeltaNN, that +allows us to assess the impact of different computational environment +parameters on the performance of image recognition models during deployment, +post training. DeltaNN generates different implementations of a given image +recognition model for variations in environment parameters, namely, deep +learning frameworks, compiler optimizations and hardware devices and analyzes +differences in model performance as a result. Using DeltaNN, we conduct an +empirical study of robustness analysis of three popular image recognition +models using the ImageNet dataset. We report the impact in terms of +misclassifications and inference time differences across different settings. In +total, we observed up to 72% output label differences across deep learning +frameworks, and up to 81% unexpected performance degradation in terms of +inference time, when applying compiler optimizations. + +
+
+ comment: 11 pages, 10 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Laughing Matters: Introducing Laughing-Face Generation using Diffusion + Models + + +
+ Speech-driven animation has gained significant traction in recent years, with
+current methods achieving near-photorealistic results. However, the field
+remains underexplored regarding non-verbal communication despite evidence
+demonstrating its importance in human interaction. In particular, generating
+laughter sequences presents a unique challenge due to the intricacy and nuances
+of this behaviour. This paper aims to bridge this gap by proposing a novel
+model capable of generating realistic laughter sequences, given a still
+portrait and an audio clip containing laughter. We highlight the failure cases
+of traditional facial animation methods and leverage recent advances in
+diffusion models to produce convincing laughter videos. We train our model on a
+diverse set of laughter datasets and introduce an evaluation metric
+specifically designed for laughter. When compared with previous speech-driven
+approaches, our model achieves state-of-the-art performance across all metrics,
+even when these are re-trained for laughter generation. Our code and project
+are publicly available.
+
+
+
+
+ + ♻ ☆ Fault Localization for Buggy Deep Learning Framework Conversions in + Image Recognition + + +
+ When deploying Deep Neural Networks (DNNs), developers often convert models +from one deep learning framework to another (e.g., TensorFlow to PyTorch). +However, this process is error-prone and can impact target model accuracy. To +identify the extent of such impact, we perform and briefly present a +differential analysis against three DNNs widely used for image recognition +(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep +learning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which +revealed numerous model crashes and output label discrepancies of up to 72%. To +mitigate such errors, we present a novel approach towards fault localization +and repair of buggy deep learning framework conversions, focusing on +pre-trained image recognition models. Our technique consists of four stages of +analysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters, +and 4) graph representation. In addition, we propose various strategies towards +fault repair of the faults detected. We implement our technique on top of the +Apache TVM deep learning compiler, and we test it by conducting a preliminary +fault localization analysis for the conversion of InceptionV3 from TF to +TFLite. Our approach detected a fault in a common DNN converter tool, which +introduced precision errors in weights, reducing model accuracy. After our +fault localization, we repaired the issue, reducing our conversion error to +zero. + +
+
+ comment: 5 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ How Good is Google Bard's Visual Understanding? An Empirical Study on + Open Challenges + + +
+ Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in +the field of conversational AI. Notably, Bard has recently been updated to +handle visual inputs alongside text prompts during conversations. Given Bard's +impressive track record in handling textual inputs, we explore its capabilities +in understanding and interpreting visual data (images) conditioned by text +questions. This exploration holds the potential to unveil new insights and +challenges for Bard and other forthcoming multi-modal Generative models, +especially in addressing complex computer vision problems that demand accurate +visual and language understanding. Specifically, in this study, we focus on 15 +diverse task scenarios encompassing regular, camouflaged, medical, under-water +and remote sensing data to comprehensively evaluate Bard's performance. Our +primary finding indicates that Bard still struggles in these vision scenarios, +highlighting the significant gap in vision-based understanding that needs to be +bridged in future developments. We expect that this empirical study will prove +valuable in advancing future models, leading to enhanced capabilities in +comprehending and interpreting fine-grained visual data. Our project is +released on https://github.com/htqin/GoogleBard-VisUnderstand + +
+
+
+
+
+ + ♻ ☆ Classifying World War II Era Ciphers with Machine Learning + + +
+ We determine the accuracy with which machine learning and deep learning +techniques can classify selected World War II era ciphers when only ciphertext +is available. The specific ciphers considered are Enigma, M-209, Sigaba, +Purple, and Typex. We experiment with three classic machine learning models, +namely, Support Vector Machines (SVM), $k$-Nearest Neighbors ($k$-NN), and +Random Forest (RF). We also experiment with four deep learning neural +network-based models: Multi-Layer Perceptrons (MLP), Long Short-Term Memory +(LSTM), Extreme Learning Machines (ELM), and Convolutional Neural Networks +(CNN). Each model is trained on features consisting of histograms, digrams, and +raw ciphertext letter sequences. Furthermore, the classification problem is +considered under four distinct scenarios: Fixed plaintext with fixed keys, +random plaintext with fixed keys, fixed plaintext with random keys, and random +plaintext with random keys. Under the most realistic scenario, given 1000 +characters per ciphertext, we are able to distinguish the ciphers with greater +than 97% accuracy. In addition, we consider the accuracy of a subset of the +learning techniques as a function of the length of the ciphertext messages. +Somewhat surprisingly, our classic machine learning models perform at least as +well as our deep learning models. We also find that ciphers that are more +similar in design are somewhat more challenging to distinguish, but not as +difficult as might be expected. + +
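The feature pipeline (histograms and digrams feeding a classic classifier) is straightforward to prototype; below is a simplified sketch with an SVM, where `X` (a list of ciphertexts) and `y` (cipher labels) are assumed given:

```python
# Sketch: histogram + digram frequency features from ciphertext, fed to an SVM.
import numpy as np
from sklearn.svm import SVC

ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
IDX = {c: i for i, c in enumerate(ALPHABET)}

def features(ciphertext):
    hist, digram = np.zeros(26), np.zeros((26, 26))
    chars = [c for c in ciphertext.upper() if c in IDX]
    for a, b in zip(chars, chars[1:]):
        hist[IDX[a]] += 1
        digram[IDX[a], IDX[b]] += 1
    if chars:
        hist[IDX[chars[-1]]] += 1          # count the final character too
    n = max(len(chars), 1)
    return np.concatenate([hist / n, digram.ravel() / max(n - 1, 1)])

# clf = SVC(kernel="rbf").fit(np.stack([features(c) for c in X]), y)
```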
+
+
+
+
+ + ♻ ☆ HypLL: The Hyperbolic Learning Library + + +
+ Deep learning in hyperbolic space is quickly gaining traction in the fields
+of machine learning, multimedia, and computer vision. Deep networks commonly
+operate in Euclidean space, implicitly assuming that data lies on regular
+grids. Recent advances have shown that hyperbolic geometry provides a viable
+alternative foundation for deep learning, especially when data is hierarchical
+in nature and when working with few embedding dimensions. Currently, however,
+no accessible open-source library exists to build hyperbolic network modules
+akin to well-known deep learning libraries. We present HypLL, the Hyperbolic
+Learning Library, to bring the progress on hyperbolic deep learning together.
+HypLL is built on top of PyTorch, with its design emphasizing ease of use, in
+order to attract a broad audience towards this new and open-ended research
+direction. The code is available at:
+https://github.com/maxvanspengler/hyperbolic_learning_library.
+
+ comment: ACM Multimedia Open-Source Software Competition 2023 +
+
+
+
+
+ + ♻ ☆ Context-Aware Composition of Agent Policies by Markov Decision Process + Entity Embeddings and Agent Ensembles + + +
+ Computational agents support humans in many areas of life and are therefore
+found in heterogeneous contexts. This means they operate in rapidly changing
+environments and can be confronted with huge state and action spaces. In order
+to perform services and carry out activities in a goal-oriented manner, agents
+require prior knowledge and therefore have to develop and pursue
+context-dependent policies. However, prescribing policies in advance is limited
+and inflexible, especially in dynamically changing environments. Moreover, the
+context of an agent determines its choice of actions. Since the environments
+can be stochastic and complex in terms of the number of states and feasible
+actions, activities are usually modelled in a simplified way by Markov decision
+processes so that, e.g., agents with reinforcement learning are able to learn
+policies that help to capture the context and act accordingly to optimally
+perform activities. However, training policies for all possible contexts using
+reinforcement learning is time-consuming. A requirement and challenge for
+agents is to learn strategies quickly and respond immediately in cross-context
+environments and applications, e.g., the Internet, service robotics,
+cyber-physical systems. In this work, we propose a novel simulation-based
+approach that enables a) the representation of heterogeneous contexts through
+knowledge graphs and entity embeddings and b) the context-aware composition of
+policies on demand by ensembles of agents running in parallel. The evaluation
+we conducted with the "Virtual Home" dataset indicates that agents with a need
+to switch seamlessly between different contexts can request on-demand composed
+policies that lead to the successful completion of context-appropriate
+activities without having to learn these policies in lengthy training steps and
+episodes, in contrast to agents that use reinforcement learning.
+
+ comment: 30 pages, 11 figures, 9 tables, 3 listings, Re-submitted to Semantic + Web Journal, Currently, under review +
+
+
+
+
+ + ♻ ☆ Food Classification using Joint Representation of Visual and Textual + Data + + +
+ Food classification is an important task in health care. In this work, we
+propose a multimodal classification framework that uses a modified version of
+EfficientNet with the Mish activation function for image classification and
+the traditional BERT transformer-based network for text classification. The
+proposed network and the other state-of-the-art methods are evaluated on a
+large open-source dataset, UPMC Food-101. The experimental results show that
+the proposed network outperforms the other methods; significant differences of
+11.57% and 6.34% in accuracy are observed for image and text classification,
+respectively, when compared with the second-best performing method. We also
+compared the performance in terms of accuracy, precision, and recall for text
+classification using both machine learning and deep learning-based models. The
+comparative analysis from the prediction results of both images and text
+demonstrated the efficiency and robustness of the proposed approach.
+
+ comment: Updated results and discussions to be posted and some sections needed + to be expanded +
+
+
+
+
+ + ♻ ☆ Solving AC Power Flow with Graph Neural Networks under Realistic + Constraints + + +
+ In this paper, we propose a graph neural network architecture to solve the AC
+power flow problem under realistic constraints. To ensure a safe and resilient
+operation of distribution grids, AC power flow calculations are the means of
+choice to determine grid operating limits or analyze grid asset utilization in
+planning procedures. In our approach, we demonstrate the development of a
+framework that uses graph neural networks to learn the physical constraints of
+the power flow. We present our model architecture on which we perform
+unsupervised training to learn a general solution of the AC power flow
+formulation independent of the specific topologies and supply tasks used for
+training. Finally, we demonstrate, validate and discuss our results on medium
+voltage benchmark grids. In our approach, we focus on the physical and
+topological properties of distribution grids to provide scalable solutions for
+real grid topologies. Therefore, we take a data-driven approach, using large
+and diverse data sets consisting of realistic grid topologies, for the
+unsupervised training of the AC power flow graph neural network architecture
+and compare the results to a prior neural architecture and the Newton-Raphson
+method. Our approach shows a substantial improvement in computation time and
+good accuracy compared to state-of-the-art solvers. It also outperforms the
+prior neural solver for power flow in terms of accuracy.
+
+
+
+
+ + ♻ ☆ Implicit neural representation for change detection + + +
+ Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained
+during two distinct time periods over the same geographic region, presents a
+significant challenge due to the disparities in spatial coverage and the
+presence of noise in the acquisition system. The most commonly used approaches
+to detecting changes in point clouds are based on supervised methods which
+necessitate extensive labelled data often unavailable in real-world
+applications. To address these issues, we propose an unsupervised approach that
+comprises two components: Implicit Neural Representation (INR) for continuous
+shape reconstruction and a Gaussian Mixture Model for categorising changes. INR
+offers a grid-agnostic representation for encoding bi-temporal point clouds,
+with unmatched spatial support that can be regularised to enhance
+high-frequency details and reduce noise. The reconstructions at each timestamp
+are compared at arbitrary spatial scales, leading to a significant increase in
+detection capabilities. We apply our method to a benchmark dataset comprising
+simulated LiDAR point clouds for urban sprawl. This dataset encompasses
+diverse challenging scenarios, varying in resolutions, input modalities and
+noise levels. This enables a comprehensive multi-scenario evaluation, comparing
+our method with the current state-of-the-art approach. We outperform the
+previous methods by a margin of 10% in the intersection over union metric. In
+addition, we put our techniques to practical use by applying them in a
+real-world scenario to identify instances of illicit excavation of
+archaeological sites and validate our results by comparing them with findings
+from field experts.
+
+ comment: Main article is 10 pages + 6 pages of supplementary. Conference style + paper +
+
+
+
+
+ + ♻ ☆ E-MCTS: Deep Exploration in Model-Based Reinforcement Learning by + Planning with Epistemic Uncertainty NeurIPS 2023 + + +
+ One of the most well-studied and highly performing planning approaches used
+in Model-Based Reinforcement Learning (MBRL) is Monte-Carlo Tree Search (MCTS).
+Dedicated deep exploration and reliability in the face of the unknown remain
+key challenges for MCTS-based MBRL methods; both can be alleviated through
+principled epistemic uncertainty estimation in the predictions of MCTS. We
+present two main contributions: First, we develop methodology to propagate
+epistemic uncertainty in MCTS, enabling agents to estimate the epistemic
+uncertainty in their predictions. Second, we utilize the propagated uncertainty
+for a novel deep exploration algorithm by explicitly planning to explore. We
+incorporate our approach into variations of MCTS-based MBRL approaches with
+learned and provided dynamics models, and empirically show deep exploration
+through successful epistemic uncertainty estimation achieved by our approach.
+We compare to a non-planning-based deep-exploration baseline, and demonstrate
+that planning with epistemic MCTS significantly outperforms non-planning based
+exploration in the investigated deep exploration benchmark.
+
+ comment: Submitted to NeurIPS 2023, accepted to EWRL 2023 +
+
+
+
+
+ + ♻ ☆ What do neural networks learn in image classification? A frequency + shortcut perspective ICCV2023 + + +
+ Frequency analysis is useful for understanding the mechanisms of
+representation learning in neural networks (NNs). Most research in this area
+focuses on the learning dynamics of NNs for regression tasks, while little
+addresses classification. This study empirically investigates the latter and
+expands the understanding of frequency shortcuts. First, we perform experiments
+on synthetic datasets, designed to have a bias in different frequency bands.
+Our results demonstrate that NNs tend to find simple solutions for
+classification, and what they learn first during training depends on the most
+distinctive frequency characteristics, which can be either low- or
+high-frequencies. Second, we confirm this phenomenon on natural images. We
+propose a metric to measure class-wise frequency characteristics and a method
+to identify frequency shortcuts. The results show that frequency shortcuts can
+be texture-based or shape-based, depending on what best simplifies the
+objective. Third, we validate the transferability of frequency shortcuts on
+out-of-distribution (OOD) test sets. Our results suggest that frequency
+shortcuts can be transferred across datasets and cannot be fully avoided by
+larger model capacity and data augmentation. We recommend that future research
+should focus on effective training schemes mitigating frequency shortcut
+learning.
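A minimal probe in the spirit of these experiments: band-limit test images with an FFT mask and see which band a classifier's accuracy tracks. The construction below is our illustration, not the paper's exact protocol:

```python
# Keep only a radial frequency band of an image via FFT masking.
import numpy as np

def bandpass(image, r_lo, r_hi):
    f = np.fft.fftshift(np.fft.fft2(image))
    h, w = image.shape
    yy, xx = np.ogrid[:h, :w]
    r = np.hypot(yy - h / 2, xx - w / 2)
    mask = (r >= r_lo) & (r < r_hi)
    return np.real(np.fft.ifft2(np.fft.ifftshift(f * mask)))

img = np.random.default_rng(0).random((64, 64))
low = bandpass(img, 0, 8)        # low-frequency content (coarse shape)
high = bandpass(img, 8, 32)      # high-frequency content (texture, edges)
# Evaluating a trained classifier on `low` vs. `high` versions of the test
# set indicates which band it relies on, i.e., a potential frequency shortcut.
```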
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Diffiner: A Versatile Diffusion-based Generative Refiner for Speech + Enhancement + + +
+ Although deep neural network (DNN)-based speech enhancement (SE) methods +outperform the previous non-DNN-based ones, they often degrade the perceptual +quality of generated outputs. To tackle this problem, we introduce a DNN-based +generative refiner, Diffiner, aiming to improve perceptual speech quality +pre-processed by an SE method. We train a diffusion-based generative model by +utilizing a dataset consisting of clean speech only. Then, our refiner +effectively mixes clean parts newly generated via denoising diffusion +restoration into the degraded and distorted parts caused by a preceding SE +method, resulting in refined speech. Once our refiner is trained on a set of +clean speech, it can be applied to various SE methods without additional +training specialized for each SE module. Therefore, our refiner can be a +versatile post-processing module w.r.t. SE methods and has high potential in +terms of modularity. Experimental results show that our method improved +perceptual speech quality regardless of the preceding SE methods used. + +
+
+ comment: Accepted by Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Deep neural networks on diffeomorphism groups for optimal shape + reparameterization + + +
+ One of the fundamental problems in shape analysis is to align curves or +surfaces before computing geodesic distances between their shapes. Finding the +optimal reparametrization realizing this alignment is a computationally +demanding task, typically done by solving an optimization problem on the +diffeomorphism group. In this paper, we propose an algorithm for constructing +approximations of orientation-preserving diffeomorphisms by composition of +elementary diffeomorphisms. The algorithm is implemented using PyTorch, and is +applicable for both unparametrized curves and surfaces. Moreover, we show +universal approximation properties for the constructed architectures, and +obtain bounds for the Lipschitz constants of the resulting diffeomorphisms. + +
+
+ comment: 36 pages, 11 figures. Accepted by BIT Numerical Mathematics, not yet + published +
+
+
+
+
+ + ♻ ☆ SignReLU neural network and its approximation ability + + +
+ Deep neural networks (DNNs) have garnered significant attention in various
+fields of science and technology in recent years. Activation functions define
+how neurons in DNNs process incoming signals. They are essential for learning
+non-linear transformations and for performing diverse computations among
+successive neuron layers. In the last few years, researchers have investigated
+the approximation ability of DNNs to explain their power and success. In this
+paper, we explore the approximation ability of DNNs using a different
+activation function, called SignReLU. Our theoretical results demonstrate that
+SignReLU networks outperform rational and ReLU networks in terms of
+approximation performance. Numerical experiments are conducted comparing
+SignReLU with the existing activations such as ReLU, Leaky ReLU, and ELU, which
+illustrate the competitive practical performance of SignReLU.
+
+
+
+
+ + ♻ ☆ G-Signatures: Global Graph Propagation With Randomized Signatures + + +
+ Graph neural networks (GNNs) have evolved into one of the most popular deep +learning architectures. However, GNNs suffer from over-smoothing node +information and, therefore, struggle to solve tasks where global graph +properties are relevant. We introduce G-Signatures, a novel graph learning +method that enables global graph propagation via randomized signatures. +G-Signatures use a new graph conversion concept to embed graph structured +information which can be interpreted as paths in latent space. We further +introduce the idea of latent space path mapping. This allows us to iteratively +traverse latent space paths, and, thus globally process information. +G-Signatures excel at extracting and processing global graph properties, and +effectively scale to large graph problems. Empirically, we confirm the +advantages of G-Signatures at several classification and regression tasks. + +
+
+ comment: 7 pages (+ appendix); 4 figures +
+
+
+
+
+ + ♻ ☆ Alien Coding + + +
+ We introduce a self-learning algorithm for synthesizing programs for OEIS
+sequences. The algorithm starts from scratch, initially generating programs
+at random. Then it runs many iterations of a self-learning loop that
+interleaves (i) training neural machine translation to learn the
+correspondence between sequences and the programs discovered so far, and
+(ii) using the trained neural machine translator to propose many new
+programs for each OEIS sequence. The algorithm discovers on its own programs
+for more than 78000 OEIS sequences, sometimes developing unusual programming
+methods. We analyze its behavior and the invented programs in several
+experiments.
+
+
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but
+their 'exposure bias' problem, described as the input mismatch between
+training and sampling, lacks in-depth exploration. In this paper, we
+systematically investigate the exposure bias problem in diffusion models by
+first analytically modelling the sampling distribution, based on which we
+then identify the prediction error at each sampling step as the root cause
+of the exposure bias issue. Furthermore, we discuss potential solutions to
+this issue and propose an intuitive metric for it. Along with the
+elucidation of exposure bias, we propose a simple, yet effective,
+training-free method called Epsilon Scaling to alleviate the exposure bias.
+We show that Epsilon Scaling explicitly moves the sampling trajectory closer
+to the vector field learned in the training phase by scaling down the
+network output (Epsilon), mitigating the input mismatch between training and
+sampling. Experiments on various diffusion frameworks (ADM, DDPM/DDIM, LDM),
+unconditional and conditional settings, and deterministic vs. stochastic
+sampling verify the effectiveness of our method.
+
+
+
+ comment: 7 pages, code available soon +
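Since Epsilon Scaling is training-free, it can be shown in a few lines: a deterministic DDIM step where the predicted noise is divided by a factor slightly above one before being reused. The constant schedule and the value 1.005 are illustrative assumptions; consult the paper for the schedules actually evaluated.

```python
import torch

@torch.no_grad()
def ddim_step_eps_scaled(model, x_t, t, alpha_bar_t, alpha_bar_prev, lam=1.005):
    """One deterministic DDIM step with the predicted noise scaled down.

    Dividing the network output eps by lam > 1 nudges the sampling
    trajectory toward the vector field seen during training, which is
    the core of the Epsilon Scaling idea as described in the abstract.
    """
    eps = model(x_t, t) / lam                                  # scaled epsilon
    x0 = (x_t - (1 - alpha_bar_t).sqrt() * eps) / alpha_bar_t.sqrt()
    return alpha_bar_prev.sqrt() * x0 + (1 - alpha_bar_prev).sqrt() * eps
```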
+
+
+
+
+ + ♻ ☆ Case-Aware Adversarial Training + + +
+ Neural networks (NNs) have become one of the most popular types of models
+in various signal processing applications. However, NNs are extremely
+vulnerable to adversarial examples (AEs). To defend against AEs, adversarial
+training (AT) is believed to be the most effective method, but its intensive
+computation limits its applicability in most settings. In this paper, to
+resolve this problem, we design a generic and efficient AT improvement
+scheme, namely case-aware adversarial training (CAT). Specifically, the
+intuition stems from the fact that a very limited portion of informative
+samples contributes most of the model performance. Accordingly, if only the
+most informative AEs are used in AT, the computational complexity of AT can
+be lowered significantly while maintaining its defense effect. To achieve
+this, CAT introduces two innovations. First, a method to estimate the
+information degree of adversarial examples is proposed for AE filtering.
+Second, to further enrich the information that the NN can obtain from AEs,
+CAT employs a sampling strategy based on weight estimation and class-level
+balancing to increase the diversity of AT at each iteration. Extensive
+experiments show that CAT is up to 3x faster than vanilla AT while achieving
+a competitive defense effect.
+
+
+
+
+
+
+ + ♻ ☆ Is Complexity Required for Neural Network Pruning? A Case Study on + Global Magnitude Pruning + + +
+ Pruning neural networks has become popular in the last decade when it was
+shown that a large number of weights can be safely removed from modern
+neural networks without compromising accuracy. Numerous pruning methods have
+been proposed since then, each claiming to be better than the previous. Many
+state-of-the-art (SOTA) techniques today rely on complex pruning
+methodologies utilizing importance scores, getting feedback through
+back-propagation or having heuristics-based pruning rules amongst others. In
+this work, we question whether this pattern of introducing complexity is
+really necessary to achieve better pruning results. We benchmark these SOTA
+techniques against a naive pruning baseline, namely, Global Magnitude
+Pruning (Global MP). Global MP ranks weights in order of their magnitudes
+and prunes the smallest ones. Hence, in its vanilla form, it is one of the
+simplest pruning techniques. Surprisingly, we find that vanilla Global MP
+outperforms all the other SOTA techniques and achieves a new SOTA result. It
+also achieves promising performance on FLOPs sparsification, which we find
+is enhanced when pruning is conducted in a gradual fashion. We also find
+that Global MP is generalizable across tasks, datasets, and models with
+superior performance. Moreover, a common issue that many pruning algorithms
+run into at high sparsity rates, namely layer-collapse, can be easily fixed
+in Global MP by setting a minimum threshold of weights to be retained in
+each layer. Lastly, unlike many other SOTA techniques, Global MP does not
+require any additional algorithm-specific hyper-parameters and is very
+straightforward to tune and implement. We showcase our findings on various
+models (WRN-28-8, ResNet-32, ResNet-50, MobileNet-V1 and FastGRNN) and
+multiple datasets (CIFAR-10, ImageNet and HAR-2). Code is available at
+https://github.com/manasgupta-1/GlobalMP.
+
+
+
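Global MP is simple enough to sketch directly. This hedged PyTorch snippet ranks all weights by magnitude, prunes the smallest fraction globally, and enforces a per-layer retention floor to prevent layer-collapse; `min_keep_frac` is an illustrative value, not the paper's setting.

```python
import torch

def global_magnitude_prune(model, sparsity=0.9, min_keep_frac=0.01):
    """Zero the globally smallest weights, keeping a floor per layer."""
    weights = [p for p in model.parameters() if p.dim() > 1]
    mags = torch.cat([p.detach().abs().flatten() for p in weights])
    k = max(1, int(sparsity * mags.numel()))
    threshold = mags.kthvalue(k).values          # global magnitude cutoff

    for p in weights:
        mag = p.detach().abs()
        mask = mag > threshold
        floor = max(1, int(min_keep_frac * mag.numel()))
        if int(mask.sum()) < floor:              # layer would collapse
            top = mag.flatten().topk(floor).indices
            mask = torch.zeros(mag.numel(), dtype=torch.bool, device=p.device)
            mask[top] = True
            mask = mask.view_as(p)
        p.data.mul_(mask.to(p.dtype))            # apply the pruning mask
    return model
```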
+
+
+
+ + ♻ ☆ Geometric Algebra Transformers + + +
+ Problems involving geometric data arise in physics, chemistry, robotics, +computer vision, and many other fields. Such data can take numerous forms, such +as points, direction vectors, translations, or rotations, but to date there is +no single architecture that can be applied to such a wide variety of geometric +types while respecting their symmetries. In this paper we introduce the +Geometric Algebra Transformer (GATr), a general-purpose architecture for +geometric data. GATr represents inputs, outputs, and hidden states in the +projective geometric (or Clifford) algebra, which offers an efficient +16-dimensional vector-space representation of common geometric objects as well +as operators acting on them. GATr is equivariant with respect to E(3), the +symmetry group of 3D Euclidean space. As a Transformer, GATr is versatile, +efficient, and scalable. We demonstrate GATr in problems from n-body modeling +to wall-shear-stress estimation on large arterial meshes to robotic motion +planning. GATr consistently outperforms both non-geometric and equivariant +baselines in terms of error, data efficiency, and scalability. + +
+
+ comment: v2: more experiments, more baselines +
+
+
+
+
+ + ♻ ☆ Is Bio-Inspired Learning Better than Backprop? Benchmarking Bio Learning + vs. Backprop + + +
+ Bio-inspired learning has been gaining popularity recently given that
+Backpropagation (BP) is not considered biologically plausible. Many
+algorithms have been proposed in the literature which are all more
+biologically plausible than BP. However, apart from overcoming the
+biological implausibility of BP, a strong motivation for using Bio-inspired
+algorithms remains lacking. In this study, we undertake a holistic
+comparison of BP vs. multiple Bio-inspired algorithms to answer the question
+of whether Bio-learning offers additional benefits over BP. We test
+Bio-algorithms under different design choices such as access to only partial
+training data, resource constraints in terms of the number of training
+epochs, sparsification of the neural network parameters and addition of
+noise to input samples. Through these experiments, we notably find two key
+advantages of Bio-algorithms over BP. Firstly, Bio-algorithms perform much
+better than BP when the entire training dataset is not supplied. Four of the
+five Bio-algorithms tested outperform BP by up to 5% accuracy when only 20%
+of the training dataset is available. Secondly, even when the full dataset
+is available, Bio-algorithms learn much quicker and converge to a stable
+accuracy in far fewer training epochs than BP. Hebbian learning,
+specifically, is able to learn in just 5 epochs compared to around 100
+epochs required by BP. These insights present practical reasons for
+utilising Bio-learning beyond just their biological plausibility and also
+point towards interesting new directions for future work on Bio-learning.
+
+
+
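For context on the Hebbian result, here is a generic Oja-style Hebbian update loop in NumPy. It stands in for the family of bio-inspired rules benchmarked and is not the exact algorithm from the study.

```python
import numpy as np

def hebbian_epoch(W, X, lr=0.01):
    """One epoch of Hebbian learning with Oja's decay term.

    W: (n_out, n_in) weights; X: (n_samples, n_in) inputs. The update
    strengthens weights between co-active units, with a decay that
    keeps the weights bounded.
    """
    for x in X:
        y = W @ x                                        # post-synaptic output
        W += lr * (np.outer(y, x) - (y ** 2)[:, None] * W)
    return W

rng = np.random.default_rng(0)
W = rng.normal(scale=0.1, size=(8, 64))
X = rng.normal(size=(100, 64))
for _ in range(5):   # the abstract reports Hebbian converging in ~5 epochs
    W = hebbian_epoch(W, X)
```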
+
+
+
+ + ♻ ☆ Control Theoretic Analysis of Temporal Difference Learning + + +
+ The goal of this manuscript is to conduct a control-theoretic analysis of
+Temporal Difference (TD) learning algorithms. TD-learning serves as a
+cornerstone in the realm of reinforcement learning, offering a methodology
+for approximating the value function associated with a given policy in a
+Markov Decision Process. Despite several existing works that have
+contributed to the theoretical understanding of TD-learning, it is only in
+recent years that researchers have been able to establish concrete
+guarantees on its statistical efficiency. In this paper, we introduce a
+finite-time, control-theoretic framework for analyzing TD-learning,
+leveraging established concepts from the field of linear systems control.
+Consequently, this paper provides additional insights into the mechanics of
+TD-learning and the broader landscape of reinforcement learning, all while
+employing straightforward analytical tools derived from control theory.
+
+
+
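For reference, this is the tabular TD(0) rule whose finite-time behavior the paper analyzes; the control-theoretic machinery itself is not reproduced here, and `env_step` is a hypothetical environment interface.

```python
import numpy as np

def td0_policy_evaluation(env_step, n_states, gamma=0.99, alpha=0.05,
                          episodes=500):
    """Tabular TD(0) estimate of a fixed policy's value function.

    env_step(s) -> (next_state, reward, done) samples one transition
    under the policy being evaluated.
    """
    V = np.zeros(n_states)
    for _ in range(episodes):
        s, done = 0, False
        while not done:
            s_next, r, done = env_step(s)
            target = r + (0.0 if done else gamma * V[s_next])
            V[s] += alpha * (target - V[s])   # temporal-difference update
            s = s_next
    return V
```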
+
+
+
+ + ♻ ☆ BCGGAN: Ballistocardiogram artifact removal in simultaneous EEG-fMRI + using generative adversarial network + + +
+ Due to its advantages of high temporal and spatial resolution, the
+technology of simultaneous electroencephalogram-functional magnetic
+resonance imaging (EEG-fMRI) acquisition and analysis has attracted much
+attention, and has been widely used in various research fields of brain
+science. However, during fMRI of the brain, ballistocardiogram (BCG)
+artifacts can seriously contaminate the EEG. As an unpaired problem, BCG
+artifact removal remains a considerable challenge. Aiming to provide a
+solution, this paper proposes a novel modular generative adversarial network
+(GAN) and a corresponding training strategy that improve network performance
+by optimizing the parameters of each module. In this manner, we hope to
+improve the local representation ability of the network model, thereby
+improving its overall performance and obtaining a reliable generator for BCG
+artifact removal. Moreover, the proposed method does not rely on additional
+reference signals or complex hardware equipment. Experimental results show
+that, compared with multiple methods, the technique presented in this paper
+can remove the BCG artifact more effectively while retaining essential EEG
+information.
+
+
+
+
+
+
+ + ♻ ☆ Estimating 3D Dental Structures using Simulated Panoramic Radiographs + and Neural Ray Tracing + + +
+ Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging
+modality for dental examination. However, PX only provides a flattened 2D
+image, lacking a 3D view of the oral structure. In this paper, we propose a
+framework to estimate 3D oral structures from real-world PX. Our framework
+tackles full 3D reconstruction for varying subjects (patients), where each
+reconstruction is based only on a single panoramic image. We create an
+intermediate representation called simulated PX (SimPX) from 3D Cone-beam
+computed tomography (CBCT) data based on the Beer-Lambert law of X-ray
+rendering and the rotational principles of PX imaging. SimPX aims not only
+to simulate PX truthfully, but also to facilitate the reverting process back
+to 3D data. We propose a novel neural model based on ray tracing which
+exploits both global and local input features to convert SimPX to 3D
+output. At inference, a real PX image is translated to a SimPX-style image
+with semantic regularization, and the translated image is processed by a
+generation module to produce high-quality outputs. Experiments show that our
+method outperforms prior state-of-the-art methods in reconstruction tasks,
+both quantitatively and qualitatively. Unlike prior methods, our method does
+not require any prior information, such as the shape of dental arches, or a
+matched PX-CBCT dataset for training, which is difficult to obtain in
+clinical practice.
+
+
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Pre-trained transformer for adversarial purification + + +
+ With more and more deep neural networks being deployed as various daily
+services, their reliability is essential. Worryingly, deep neural networks
+are vulnerable and sensitive to adversarial attacks, the most common of
+which against such services are evasion-based. Recent works usually
+strengthen robustness via adversarial training or by leveraging knowledge
+from large amounts of clean data. However, in practical terms, retraining
+and redeploying the model require a large computational budget, leading to
+heavy losses for the online service. In addition, when adversarial examples
+of a certain attack are detected, only limited adversarial examples are
+available to the service provider, while much clean data may not be
+accessible. Given these problems, we propose a new scenario, RaPiD (Rapid
+Plug-in Defender), in which a certain attack on the frozen original service
+model must be rapidly defended against using only a few clean and
+adversarial examples. Motivated by the generalization and universal
+computation ability of pre-trained transformer models, we propose a new
+defender method, CeTaD, which stands for Considering Pre-trained
+Transformers as Defenders. In particular, we evaluate the effectiveness and
+the transferability of CeTaD in the case of one-shot adversarial examples
+and explore the impact of different parts of CeTaD as well as training data
+conditions. CeTaD is flexible, able to be embedded into an arbitrary
+differentiable model, and suitable for various types of attacks.
+
+
+
+
+
+
+ + ♻ ☆ Assessing Hidden Risks of LLMs: An Empirical Study on Robustness, + Consistency, and Credibility + + +
+ The recent popularity of large language models (LLMs) has had a significant
+impact on countless fields, particularly through their open-ended ecosystem
+of APIs, open-sourced models, and plugins. However, with their widespread
+deployment, there is a general lack of research that thoroughly discusses
+and analyzes the potential risks they conceal. We therefore conduct a
+preliminary but pioneering study covering the robustness, consistency, and
+credibility of LLM systems. With most of the related literature in the LLM
+era uncharted, we propose an automated workflow that copes with an upscaled
+number of queries/responses. Overall, we conduct over a million queries to
+mainstream LLMs including ChatGPT, LLaMA, and OPT. The core of our workflow
+consists of a data primitive, followed by an automated interpreter that
+evaluates these LLMs under different adversarial metrical systems. As a
+result, we draw several, and perhaps unfortunate, conclusions that are quite
+uncommon in this trendy community. Briefly, they are: (i) minor but
+inevitable errors in the user-generated query input may, by chance, cause
+the LLM to respond unexpectedly; (ii) LLMs possess poor consistency when
+processing semantically similar query input. In addition, as a side finding,
+we find that ChatGPT is still capable of yielding the correct answer even
+when the input is polluted at an extreme level. While this phenomenon
+demonstrates the powerful memorization of LLMs, it raises serious concerns
+about using such data for LLM-involved evaluation in academic development.
+To deal with this, we propose a novel index associated with a dataset that
+roughly decides the feasibility of using such data for LLM-involved
+evaluation. Extensive empirical studies are included to support the
+aforementioned claims.
+
+
+
+
+
+
+ + ♻ ☆ Automatically Correcting Large Language Models: Surveying the landscape + of diverse self-correction strategies + + +
+ Large language models (LLMs) have demonstrated remarkable performance across +a wide array of NLP tasks. However, their efficacy is undermined by undesired +and inconsistent behaviors, including hallucination, unfaithful reasoning, and +toxic content. A promising approach to rectify these flaws is self-correction, +where the LLM itself is prompted or guided to fix problems in its own output. +Techniques leveraging automated feedback -- either produced by the LLM itself +or some external system -- are of particular interest as they are a promising +way to make LLM-based solutions more practical and deployable with minimal +human feedback. This paper presents a comprehensive review of this emerging +class of techniques. We analyze and taxonomize a wide array of recent work +utilizing these strategies, including training-time, generation-time, and +post-hoc correction. We also summarize the major applications of this strategy +and conclude by discussing future directions and challenges. + +
+
+ comment: Work in Progress. Version 2 +
+
+
+
+
+ + ♻ ☆ WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant + Analysis ICCV 2023 + + +
+ Deep neural networks are susceptible to generating overconfident yet
+erroneous predictions when presented with data beyond known concepts. This
+challenge underscores the importance of detecting out-of-distribution (OOD)
+samples in the open world. In this work, we propose a novel feature-space
+OOD detection score based on class-specific and class-agnostic information.
+Specifically, the approach utilizes Whitened Linear Discriminant Analysis to
+project features into two subspaces - the discriminative and residual
+subspaces - for which the in-distribution (ID) classes are maximally
+separated and closely clustered, respectively. The OOD score is then
+determined by combining the input data's deviation from the ID pattern in
+both subspaces. The efficacy of our method, named WDiscOOD, is verified on
+the large-scale ImageNet-1k benchmark, with six OOD datasets that cover a
+variety of distribution shifts. WDiscOOD demonstrates superior performance
+on deep classifiers with diverse backbone architectures, including CNNs and
+vision transformers. Furthermore, we also show that WDiscOOD more
+effectively detects novel concepts in representation spaces trained with
+contrastive objectives, including supervised contrastive loss and
+multi-modality contrastive loss.
+
+
+
+ comment: Accepted by ICCV 2023. Code is available at: + https://github.com/ivalab/WDiscOOD.git +
+
+
+
+
+ + ♻ ☆ Dataflow Analysis-Inspired Deep Learning for Efficient Vulnerability + Detection ICSE 2024 + + +
+ Deep learning-based vulnerability detection has shown great performance
+and, in some studies, outperformed static analysis tools. However, the
+highest-performing approaches use token-based transformer models, which are
+not the most efficient at capturing the code semantics required for
+vulnerability detection. Classical program analysis techniques such as
+dataflow analysis can detect many types of bugs based on their root causes.
+In this paper, we propose to combine such causal-based vulnerability
+detection algorithms with deep learning, aiming to achieve more efficient
+and effective vulnerability detection. Specifically, we designed DeepDFA, a
+dataflow analysis-inspired graph learning framework and an embedding
+technique that enables graph learning to simulate dataflow computation. We
+show that DeepDFA is both performant and efficient. DeepDFA outperformed all
+non-transformer baselines. It was trained in 9 minutes, 75x faster than the
+highest-performing baseline model. When using only 50+ vulnerable examples
+and several hundred total examples as training data, the model retained the
+same performance as with 100% of the dataset. DeepDFA also generalized to
+real-world vulnerabilities in DBGBench; it detected 8.7 out of 17
+vulnerabilities on average across folds and was able to distinguish between
+patched and buggy versions, while the highest-performing baseline models did
+not detect any vulnerabilities. By combining DeepDFA with a large language
+model, we surpassed the state-of-the-art vulnerability detection performance
+on the Big-Vul dataset with 96.46 F1 score, 97.82 precision, and 95.14
+recall. Our replication package is located at
+https://figshare.com/s/e7953b4d345b00990d17.
+
+
+
+ comment: 11 pages, 9 figures. Accepted as a conference paper at ICSE 2024 +
+
+
+
+
+ + ♻ ☆ Toward Generalizable Machine Learning Models in Speech, Language, and + Hearing Sciences: Sample Size Estimation and Reducing Overfitting + + +
+ This study's first purpose is to provide quantitative evidence that would
+incentivize researchers to use the more robust method of nested
+cross-validation instead of less robust alternatives such as the single
+holdout. The second purpose is to present methods and MATLAB codes for
+performing power analysis for ML-based analyses during the design of a
+study. Monte Carlo simulations were used to quantify the interactions
+between the employed cross-validation method, the discriminative power of
+features, the dimensionality of the feature space, and the dimensionality of
+the model. Four different cross-validations (single holdout, 10-fold,
+train-validation-test, and nested 10-fold) were compared based on the
+statistical power and statistical confidence of the ML models. Distributions
+of the null and alternative hypotheses were used to determine the minimum
+required sample size for obtaining a statistically significant outcome
+($\alpha=0.05$, $1-\beta=0.8$). Statistical confidence of the model was
+defined as the probability of correct features being selected and hence
+being included in the final model. Our analysis showed that the model
+generated based on the single holdout method had very low statistical power
+and statistical confidence and that it significantly overestimated the
+accuracy. Conversely, the nested 10-fold cross-validation resulted in the
+highest statistical confidence and the highest statistical power, while
+providing an unbiased estimate of the accuracy. The required sample size
+with a single holdout could be 50% higher than what would be needed if
+nested cross-validation were used. Confidence in the model based on nested
+cross-validation was as much as four times higher than the confidence in the
+single holdout-based model. A computational model, MATLAB codes, and lookup
+tables are provided to assist researchers with estimating the sample size
+during the design of their future studies.
+
+
+
+ comment: Under review at JSLHR +
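A minimal scikit-learn sketch of the recommended nested cross-validation, with an inner loop for hyperparameter tuning and an outer loop for an unbiased accuracy estimate; the study's own analyses and power calculations are done in MATLAB, so this is only a generic illustration.

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

# Inner 10-fold CV tunes C; outer 10-fold CV scores the tuned pipeline,
# so no sample is used both to select hyperparameters and to grade them.
inner = GridSearchCV(SVC(), param_grid={"C": [0.1, 1, 10]}, cv=10)
outer_scores = cross_val_score(inner, X, y, cv=10)
print(f"nested CV accuracy: {outer_scores.mean():.3f} +/- {outer_scores.std():.3f}")
```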
+
+
+
+
+ + ♻ ☆ Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals + + +
+ We consider the problem of sampling from a distribution governed by a
+potential function. This work proposes an explicit score-based MCMC method
+whose particle evolution is deterministic rather than governed by a
+stochastic differential equation. The score term is given in closed form by
+a regularized Wasserstein proximal, using a kernel convolution that is
+approximated by sampling. We demonstrate fast convergence on various
+problems and show improved dimensional dependence of mixing time bounds for
+the case of Gaussian distributions compared to the unadjusted Langevin
+algorithm (ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We
+additionally derive closed-form expressions for the distributions at each
+iterate for quadratic potential functions, characterizing the variance
+reduction. Empirical results demonstrate that the particles behave in an
+organized manner, lying on level set contours of the potential. Moreover,
+the posterior mean estimator of the proposed method is shown to be closer to
+the maximum a posteriori estimator compared to ULA and MALA, in the context
+of Bayesian logistic regression.
+
+
+
+
+
+
+ + ♻ ☆ Evolutionary Reinforcement Learning: A Survey + + +
+ Reinforcement learning (RL) is a machine learning approach that trains agents +to maximize cumulative rewards through interactions with environments. The +integration of RL with deep learning has recently resulted in impressive +achievements in a wide range of challenging tasks, including board games, +arcade games, and robot control. Despite these successes, there remain several +crucial challenges, including brittle convergence properties caused by +sensitive hyperparameters, difficulties in temporal credit assignment with long +time horizons and sparse rewards, a lack of diverse exploration, especially in +continuous search space scenarios, difficulties in credit assignment in +multi-agent reinforcement learning, and conflicting objectives for rewards. +Evolutionary computation (EC), which maintains a population of learning agents, +has demonstrated promising performance in addressing these limitations. This +article presents a comprehensive survey of state-of-the-art methods for +integrating EC into RL, referred to as evolutionary reinforcement learning +(EvoRL). We categorize EvoRL methods according to key research fields in RL, +including hyperparameter optimization, policy search, exploration, reward +shaping, meta-RL, and multi-objective RL. We then discuss future research +directions in terms of efficient methods, benchmarks, and scalable platforms. +This survey serves as a resource for researchers and practitioners interested +in the field of EvoRL, highlighting the important challenges and opportunities +for future research. With the help of this survey, researchers and +practitioners can develop more efficient methods and tailored benchmarks for +EvoRL, further advancing this promising cross-disciplinary research field. + +
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can
+arise from extensive unsupervised training data. Such biases can produce
+suboptimal samples, skewed outcomes, and unfairness, with potentially
+serious consequences. Consequently, aligning these models with human ethics
+and preferences is an essential step toward ensuring their responsible and
+effective deployment in real-world applications. Prior research has
+primarily employed Reinforcement Learning from Human Feedback (RLHF) to
+address this problem, where generative models are fine-tuned with RL
+algorithms guided by a human-feedback-informed reward model. However, the
+inefficiencies and instabilities associated with RL algorithms frequently
+present substantial obstacles to successful alignment, necessitating the
+development of a more robust and streamlined approach. To this end, we
+introduce a new framework, Reward rAnked FineTuning (RAFT), designed to
+align generative models effectively. Utilizing a reward model and a
+sufficient number of samples, our approach selects the high-quality samples,
+discarding those that exhibit undesired behavior, and subsequently enhances
+the model by fine-tuning on these filtered samples. Our studies show that
+RAFT can effectively improve model performance both in reward learning and
+in other automated metrics, for both large language models and diffusion
+models.
+
+
+
+ comment: 26 pages, 8 figures +
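Schematically, one RAFT round is sample, rank by reward, filter, then fine-tune. The sketch below assumes hypothetical `policy.sample` and `reward_model.score` interfaces; `k` and the retention fraction are illustrative, not the paper's settings.

```python
import torch

def raft_round(policy, reward_model, prompts, k=8, top_frac=0.125):
    """Collect the reward-ranked best samples for supervised fine-tuning."""
    kept = []
    for prompt in prompts:
        candidates = [policy.sample(prompt) for _ in range(k)]
        rewards = torch.tensor(
            [reward_model.score(prompt, c) for c in candidates])
        n_keep = max(1, int(top_frac * k))
        for i in rewards.topk(n_keep).indices:   # highest-reward candidates
            kept.append((prompt, candidates[int(i)]))
    return kept  # then fine-tune `policy` on `kept` with a standard LM loss
```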
+
+
+
+
+ + ♻ ☆ Coagent Networks Revisited + + +
+ Coagent networks formalize the concept of arbitrary networks of stochastic
+agents that collaborate to take actions in a reinforcement learning
+environment. Prominent examples of coagent networks in action include
+approaches to hierarchical reinforcement learning (HRL), such as those using
+options, which attempt to address the exploration-exploitation trade-off by
+introducing abstract actions at different levels, sequencing multiple
+stochastic networks within the HRL agents. We first provide a unifying
+perspective on the many diverse examples that fall under coagent networks.
+We do so by formalizing the rules of execution in a coagent network, enabled
+by the novel and intuitive idea of execution paths in a coagent network.
+Motivated by parameter sharing in the hierarchical option-critic
+architecture, we revisit the coagent network theory and achieve a much
+shorter proof of the policy gradient theorem using our idea of execution
+paths, without any assumption on how parameters are shared among coagents.
+We then generalize our setting and proof to include the scenario where
+coagents act asynchronously. This new perspective and theorem also lead to
+more mathematically accurate and performant algorithms than those in the
+existing literature. Lastly, by running nonstationary RL experiments, we
+survey the performance and properties of different generalizations of
+option-critic models.
+
+
+
+ comment: Reformatted paper significantly and clarified results on the + asynchronous case +
+
+
+
+
+ + ♻ ☆ Towards Generalist Robots: A Promising Paradigm via Generative + Simulation + + +
+ This document serves as a position paper that outlines the authors' vision +for a potential pathway towards generalist robots. The purpose of this document +is to share the excitement of the authors with the community and highlight a +promising research direction in robotics and AI. The authors believe the +proposed paradigm is a feasible path towards accomplishing the long-standing +goal of robotics research: deploying robots, or embodied AI agents more +broadly, in various non-factory real-world settings to perform diverse tasks. +This document presents a specific idea for mining knowledge in the latest +large-scale foundation models for robotics research. Instead of directly using +or adapting these models to produce low-level policies and actions, it +advocates for a fully automated generative pipeline (termed as generative +simulation), which uses these models to generate diversified tasks, scenes and +training supervisions at scale, thereby scaling up low-level skill learning and +ultimately leading to a foundation model for robotics that empowers generalist +robots. The authors are actively pursuing this direction, but in the meantime, +they recognize that the ambitious goal of building generalist robots with +large-scale policy training demands significant resources such as computing +power and hardware, and research groups in academia alone may face severe +resource constraints in implementing the entire vision. Therefore, the authors +believe sharing their thoughts at this early stage could foster discussions, +attract interest towards the proposed pathway and related topics from industry +groups, and potentially spur significant technical advancements in the field. + +
+
+
+
+
+ + ♻ ☆ Mixed-type Distance Shrinkage and Selection for Clustering via Kernel + Metric Learning + + +
+ Distance-based clustering and classification are widely used in various
+fields to group mixed numeric and categorical data. In many algorithms, a
+predefined distance measurement is used to cluster data points based on
+their dissimilarity. While there exist numerous distance-based measures for
+data with pure numerical attributes and several ordered and unordered
+categorical metrics, an efficient and accurate distance for mixed-type data
+that utilizes the continuous and discrete properties simultaneously is an
+open problem. Many metrics convert numerical attributes to categorical ones
+or vice versa. They handle the data points as a single attribute type or
+calculate a distance between each attribute separately and add them up. We
+propose a metric called KDSUM that uses mixed kernels to measure
+dissimilarity, with cross-validated optimal bandwidth selection. We
+demonstrate that KDSUM is a shrinkage method from existing mixed-type
+metrics to a uniform dissimilarity metric, and improves clustering accuracy
+when utilized in existing distance-based clustering algorithms on simulated
+and real-world datasets containing continuous-only, categorical-only, and
+mixed-type data.
+
+
+
+ comment: 38 pages, 3 tables, 8 figures +
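A toy summed mixed-kernel dissimilarity in the spirit of KDSUM: a Gaussian kernel for numeric attributes and an Aitchison-Aitken-style kernel for unordered categoricals, shifted and negated into a distance. The fixed bandwidths stand in for the paper's cross-validated selection.

```python
import numpy as np

def mixed_kernel_distance(x, y, num_idx, cat_idx, h=1.0, lam=0.2):
    """Sum per-attribute kernel similarities, then flip into a distance."""
    sim = 0.0
    for i in num_idx:                      # continuous attributes
        sim += np.exp(-0.5 * ((x[i] - y[i]) / h) ** 2)
    for i in cat_idx:                      # unordered categorical attributes
        sim += (1.0 - lam) if x[i] == y[i] else lam
    return len(num_idx) + len(cat_idx) - sim   # high similarity -> low distance

x = [1.2, "red", 3.4]
y = [0.9, "blue", 3.1]
print(mixed_kernel_distance(x, y, num_idx=[0, 2], cat_idx=[1]))
```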
+
+
+
+
+ + ♻ ☆ Flexible Phase Dynamics for Bio-Plausible Contrastive Learning ICML + + +
+ Many learning algorithms used as normative models in neuroscience or as
+candidate approaches for learning on neuromorphic chips learn by contrasting
+one set of network states with another. These Contrastive Learning (CL)
+algorithms are traditionally implemented with rigid, temporally non-local,
+and periodic learning dynamics that could limit the range of physical
+systems capable of harnessing CL. In this study, we build on recent work
+exploring how CL might be implemented by biological or neuromorphic systems
+and show that this form of learning can be made temporally local, and can
+still function even if many of the dynamical requirements of standard
+training procedures are relaxed. Thanks to a set of general theorems
+corroborated by numerical experiments across several CL models, our results
+provide theoretical foundations for the study and development of CL methods
+for biological and neuromorphic neural networks.
+
+
+
+ comment: 23 pages, 4 figures. Paper accepted to ICML and update includes + changes made based on reviewer feedback +
+
+
+
+
+ + ♻ ☆ GRASP: A Goodness-of-Fit Test for Classification Learning + + +
+ Performance of classifiers is often measured in terms of average accuracy
+on test data. Despite being a standard measure, average accuracy fails to
+characterize the fit of the model to the underlying conditional law of
+labels given the features vector ($Y|X$), e.g. due to model
+misspecification, overfitting, and high-dimensionality. In this paper, we
+consider the fundamental problem of assessing the goodness-of-fit for a
+general binary classifier. Our framework does not make any parametric
+assumption on the conditional law $Y|X$, and treats that as a black box
+oracle model which can be accessed only through queries. We formulate the
+goodness-of-fit assessment problem as a tolerance hypothesis test of the
+form \[ H_0: \mathbb{E}\Big[D_f\Big({\sf Bern}(\eta(X))\|{\sf
+Bern}(\hat{\eta}(X))\Big)\Big]\leq \tau\,, \] where $D_f$ represents an
+$f$-divergence function, and $\eta(x)$, $\hat{\eta}(x)$ respectively denote
+the true and an estimated likelihood for a feature vector $x$ admitting a
+positive label. We propose a novel test, called GRASP, for testing $H_0$,
+which works in finite sample settings, no matter the features
+(distribution-free). We also propose model-X GRASP, designed for model-X
+settings where the joint distribution of the features vector is known.
+Model-X GRASP uses this distributional information to achieve better power.
+We evaluate the performance of our tests through extensive numerical
+experiments.
+
+
+
+ comment: 54 pages, 4 tables and 5 figures +
+
+
+
+
+ + ♻ ☆ Inferring Traffic Models in Terminal Airspace from Flight Tracks and + Procedures + + +
+ Realistic aircraft trajectory models are useful in the design and
+validation of air traffic management (ATM) systems. Models of aircraft
+operated under instrument flight rules (IFR) require capturing the
+variability inherent in how aircraft follow standard flight procedures. This
+variability in aircraft behavior differs among flight stages. In this paper,
+we propose a probabilistic model that can learn the variability from
+procedural data and flight tracks collected from radar surveillance data.
+For each segment, a Gaussian mixture model is used to learn the deviations
+of aircraft trajectories from their procedures. Given new procedures, we can
+generate synthetic trajectories by sampling a series of deviations from the
+trained Gaussian distributions and reconstructing the aircraft trajectory
+using the deviations and the procedures. We extend this method to capture
+pairwise correlations between aircraft and show how a pairwise model can be
+used to generate traffic involving an arbitrary number of aircraft. We
+demonstrate the proposed models on the arrival tracks and procedures of John
+F. Kennedy International Airport. The distributional similarity between the
+original and the synthetic trajectory datasets was evaluated using the
+Jensen-Shannon divergence between the empirical distributions of different
+variables. We also provide qualitative analyses of the synthetic
+trajectories generated from the models.
+
+
+
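The single-aircraft generative recipe (procedure plus GMM-sampled deviations) can be sketched in a few lines with scikit-learn; shapes, the deviation representation, and the pairwise extension are simplified assumptions here.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
procedure = np.stack([np.linspace(0, 10, 50), np.zeros(50)], axis=1)  # (T, 2)
tracks = procedure + rng.normal(scale=0.3, size=(200, 50, 2))  # observed tracks

# Fit a GMM to per-flight deviations from the published procedure...
deviations = (tracks - procedure).reshape(200, -1)
gmm = GaussianMixture(n_components=3, covariance_type="diag",
                      random_state=0).fit(deviations)

# ...then synthesize new trajectories as procedure + sampled deviations.
sampled = gmm.sample(5)[0].reshape(5, 50, 2)
synthetic_tracks = procedure + sampled
```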
+
+
+
+ + ♻ ☆ Pre-Training Representations of Binary Code Using Contrastive Learning + + +
+ Compiled software is delivered as executable binary code. Developers write +source code to express the software semantics, but the compiler converts it to +a binary format that the CPU can directly execute. Therefore, binary code +analysis is critical to applications in reverse engineering and computer +security tasks where source code is not available. However, unlike source code +and natural language that contain rich semantic information, binary code is +typically difficult for human engineers to understand and analyze. While +existing work uses AI models to assist source code analysis, few studies have +considered binary code. In this paper, we propose a COntrastive learning Model +for Binary cOde Analysis, or COMBO, that incorporates source code and comment +information into binary code during representation learning. Specifically, we +present three components in COMBO: (1) a primary contrastive learning method +for cold-start pre-training, (2) a simplex interpolation method to incorporate +source code, comments, and binary code, and (3) an intermediate representation +learning algorithm to provide binary code embeddings. Finally, we evaluate the +effectiveness of the pre-trained representations produced by COMBO using three +indicative downstream tasks relating to binary code: algorithmic functionality +classification, binary code similarity, and vulnerability detection. Our +experimental results show that COMBO facilitates representation learning of +binary code visualized by distribution analysis, and improves the performance +on all three downstream tasks by 5.45% on average compared to state-of-the-art +large-scale language representation models. To the best of our knowledge, COMBO +is the first language representation model that incorporates source code, +binary code, and comments into contrastive code representation learning and +unifies multiple tasks for binary code analysis. + +
+
+
+
+
+ + ♻ ☆ System identification of neural systems: If we got it right, would we + know? + + +
+ Artificial neural networks are being proposed as models of parts of the +brain. The networks are compared to recordings of biological neurons, and good +performance in reproducing neural responses is considered to support the +model's validity. A key question is how much this system identification +approach tells us about brain computation. Does it validate one model +architecture over another? We evaluate the most commonly used comparison +techniques, such as a linear encoding model and centered kernel alignment, to +correctly identify a model by replacing brain recordings with known ground +truth models. System identification performance is quite variable; it also +depends significantly on factors independent of the ground truth architecture, +such as stimuli images. In addition, we show the limitations of using +functional similarity scores in identifying higher-level architectural motifs. + +
+
+
+
+
+ + ♻ ☆ Hypernetwork approach to Bayesian MAML + + +
+ The main goal of Few-Shot learning algorithms is to enable learning from
+small amounts of data. One of the most popular and elegant Few-Shot learning
+approaches is Model-Agnostic Meta-Learning (MAML). The main idea behind this
+method is to learn the shared universal weights of a meta-model, which are
+then adapted for specific tasks. However, the method suffers from
+over-fitting and poorly quantifies uncertainty due to limited data size.
+Bayesian approaches could, in principle, alleviate these shortcomings by
+learning weight distributions in place of point-wise weights. Unfortunately,
+previous modifications of MAML are limited by the simplicity of Gaussian
+posteriors, by MAML-like gradient-based weight updates, or by the same
+structure being enforced for universal and adapted weights.
+ In this paper, we propose a novel framework for Bayesian MAML called
+BayesianHMAML, which employs Hypernetworks for weight updates. It learns the
+universal weights point-wise, but a probabilistic structure is added when
+adapting for specific tasks. In such a framework, we can use simple Gaussian
+distributions or more complicated posteriors induced by Continuous
+Normalizing Flows.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2205.15745 +
+
+
+
+
+ + ♻ ☆ xxMD: Benchmarking Neural Force Fields Using Extended Dynamics beyond + Equilibrium + + +
+ Neural force fields (NFFs) have gained prominence in computational chemistry +as surrogate models, superseding quantum-chemistry calculations in ab initio +molecular dynamics. The prevalent benchmark for NFFs has been the MD17 dataset +and its subsequent extension. These datasets predominantly comprise geometries +from the equilibrium region of the ground electronic state potential energy +surface, sampling from direct adiabatic dynamics. However, many chemical +reactions entail significant molecular deformations, notably bond breaking. We +demonstrate the constrained distribution of internal coordinates and energies +in the MD17 datasets, underscoring their inadequacy for representing systems +undergoing chemical reactions. Addressing this sampling limitation, we +introduce the xxMD (Extended Excited-state Molecular Dynamics) dataset, derived +from non-adiabatic dynamics. This dataset encompasses energies and forces +ascertained from both multireference wave function theory and density +functional theory. Furthermore, its nuclear configuration spaces authentically +depict chemical reactions, making xxMD a more chemically relevant dataset. Our +re-assessment of equivariant models on the xxMD datasets reveals notably higher +mean absolute errors than those reported for MD17 and its variants. This +observation underscores the challenges faced in crafting a generalizable NFF +model with extrapolation capability. Our proposed xxMD-CASSCF and xxMD-DFT +datasets are available at https://github.com/zpengmei/xxMD. + +
+
+ comment: 19 pages, many figures. Data available at + https://github.com/zpengmei/xxMD +
+
+
+
+
+ + ♻ ☆ Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models ICCV 2023 + + +
+ Despite tremendous progress in generating high-quality images using
+diffusion models, synthesizing a sequence of animated frames that are both
+photorealistic and temporally coherent is still in its infancy. While
+off-the-shelf billion-scale datasets for image generation are available,
+collecting similar video data of the same scale is still challenging. Also,
+training a video diffusion model is computationally much more expensive than
+its image counterpart. In this work, we explore finetuning a pretrained
+image diffusion model with video data as a practical solution for the video
+synthesis task. We find that naively extending the image noise prior to a
+video noise prior in video diffusion leads to sub-optimal performance. Our
+carefully designed video noise prior leads to substantially better
+performance. Extensive experimental validation shows that our model,
+Preserve Your Own Correlation (PYoCo), attains SOTA zero-shot text-to-video
+results on the UCF-101 and MSR-VTT benchmarks. It also achieves SOTA video
+generation quality on the small-scale UCF-101 benchmark with a $10\times$
+smaller model using significantly less computation than the prior art.
+
+
+
+ comment: ICCV 2023. Project webpage: + https://research.nvidia.com/labs/dir/pyoco +
+
+
+
+
+ + ♻ ☆ DR.CPO: Diversified and Realistic 3D Augmentation via Iterative + Construction, Random Placement, and HPR Occlusion + + +
+ In autonomous driving, data augmentation is commonly used for improving 3D
+object detection. The most basic methods include insertion of copied objects
+and rotation and scaling of the entire training frame. Numerous variants
+have been developed as well. The existing methods, however, are considerably
+limited when compared to the variety of real-world possibilities. In this
+work, we develop a diversified and realistic augmentation method that can
+flexibly construct a whole-body object, freely locate and rotate the object,
+and apply self-occlusion and external-occlusion accordingly. To improve the
+diversity of the whole-body object construction, we develop an iterative
+method that stochastically combines multiple objects observed from the real
+world into a single object. Unlike the existing augmentation methods, the
+constructed objects can be randomly located and rotated in the training
+frame because proper occlusions can be reflected to the whole-body objects
+in the final step. Finally, proper self-occlusion at each local object level
+and external-occlusion at the global frame level are applied using the
+Hidden Point Removal (HPR) algorithm, which is computationally efficient.
+HPR is also used for adaptively controlling the point density of each object
+according to the object's distance from the LiDAR. Experiment results show
+that the proposed DR.CPO algorithm is data-efficient and model-agnostic
+without incurring any computational overhead. Also, DR.CPO can improve mAP
+performance by 2.08% when compared to the best 3D detection result known for
+the KITTI dataset. The code is available at
+https://github.com/SNU-DRL/DRCPO.git
+
+
+
+
+
+
+ + ♻ ☆ Multi-Response Heteroscedastic Gaussian Process Models and Their + Inference + + +
+ Despite the widespread utilization of Gaussian process models for versatile +nonparametric modeling, they exhibit limitations in effectively capturing +abrupt changes in function smoothness and accommodating relationships with +heteroscedastic errors. Addressing these shortcomings, the heteroscedastic +Gaussian process (HeGP) regression seeks to introduce flexibility by +acknowledging the variability of residual variances across covariates in the +regression model. In this work, we extend the HeGP concept, expanding its scope +beyond regression tasks to encompass classification and state-space models. To +achieve this, we propose a novel framework where the Gaussian process is +coupled with a covariate-induced precision matrix process, adopting a mixture +formulation. This approach enables the modeling of heteroscedastic covariance +functions across covariates. To mitigate the computational challenges posed by +sampling, we employ variational inference to approximate the posterior and +facilitate posterior predictive modeling. Additionally, our training process +leverages an EM algorithm featuring closed-form M-step updates to efficiently +evaluate the heteroscedastic covariance function. A notable feature of our +model is its consistent performance on multivariate responses, accommodating +various types (continuous or categorical) seamlessly. Through a combination of +simulations and real-world applications in climatology, we illustrate the +model's prowess and advantages. By overcoming the limitations of traditional +Gaussian process models, our proposed framework offers a robust and versatile +tool for a wide array of applications. + +
+
+ comment: submitted to the Journal of the American Statistical Association + (JASA) +
+
+
+
+
+ + ♻ ☆ DNAGPT: A Generalized Pre-trained Tool for Versatile DNA Sequence + Analysis Tasks + + +
+ Pre-trained large language models demonstrate potential in extracting +information from DNA sequences, yet adapting to a variety of tasks and data +modalities remains a challenge. To address this, we propose DNAGPT, a +generalized DNA pre-training model trained on over 200 billion base pairs from +all mammals. By enhancing the classic GPT model with a binary classification +task (DNA sequence order), a numerical regression task (guanine-cytosine +content prediction), and a comprehensive token language, DNAGPT can handle +versatile DNA analysis tasks while processing both sequence and numerical data. +Our evaluation of genomic signal and region recognition, mRNA abundance +regression, and artificial genomes generation tasks demonstrates DNAGPT's +superior performance compared to existing models designed for specific +downstream tasks, benefiting from pre-training using the newly designed model +structure. + +
+
+
+
+
+ + ♻ ☆ Expressive Text-to-Image Generation with Rich Text ICCV 2023 + + +
+ Plain text has become a prevalent interface for text-to-image synthesis. +However, its limited customization options hinder users from accurately +describing desired outputs. For example, plain text makes it hard to specify +continuous quantities, such as the precise RGB color value or importance of +each word. Furthermore, creating detailed text prompts for complex scenes is +tedious for humans to write and challenging for text encoders to interpret. To +address these challenges, we propose using a rich-text editor supporting +formats such as font style, size, color, and footnote. We extract each word's +attributes from rich text to enable local style control, explicit token +reweighting, precise color rendering, and detailed region synthesis. We achieve +these capabilities through a region-based diffusion process. We first obtain +each word's region based on attention maps of a diffusion process using plain +text. For each region, we enforce its text attributes by creating +region-specific detailed prompts and applying region-specific guidance, and +maintain its fidelity against plain-text generation through region-based +injections. We present various examples of image generation from rich text and +demonstrate that our method outperforms strong baselines with quantitative +evaluations. + +
+
+ comment: ICCV 2023. Project webpage: https://rich-text-to-image.github.io/ +
+
+
+
+
+ + ♻ ☆ From Chaos Comes Order: Ordering Event Representations for Object + Recognition and Detection ICCV 2023 + + +
+ Today, state-of-the-art deep neural networks that process events first
+convert them into dense, grid-like input representations before using an
+off-the-shelf network. However, selecting the appropriate representation for
+the task traditionally requires training a neural network for each
+representation and selecting the best one based on the validation score,
+which is very time-consuming. This work eliminates this bottleneck by
+selecting representations based on the Gromov-Wasserstein Discrepancy (GWD)
+between raw events and their representation. It is about 200 times faster to
+compute than training a neural network and preserves the task performance
+ranking of event representations across multiple representations, network
+backbones, datasets, and tasks. Thus, finding representations with high task
+scores is equivalent to finding representations with a low GWD. We use this
+insight to, for the first time, perform a hyperparameter search on a large
+family of event representations, revealing new and powerful representations
+that exceed the state-of-the-art. Our optimized representations outperform
+existing representations by 1.7 mAP on the 1 Mpx dataset and 0.3 mAP on the
+Gen1 dataset, two established object detection benchmarks, and reach a 3.8%
+higher classification score on the mini N-ImageNet benchmark. Moreover, we
+outperform the state-of-the-art by 2.1 mAP on Gen1 and state-of-the-art
+feed-forward methods by 6.0 mAP on the 1 Mpx dataset. This work opens a new
+unexplored field of explicit representation optimization for event-based
+learning.
+
+
+
+ comment: 15 pages, 11 figures, 2 tables, ICCV 2023 Camera Ready paper +
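Using the POT (Python Optimal Transport) library, the representation-scoring idea looks roughly as follows; how the paper builds the two metric spaces from raw events and gridded representations differs in detail, so treat this as a generic sketch.

```python
import numpy as np
import ot  # POT: Python Optimal Transport

rng = np.random.default_rng(0)
events = rng.random((64, 4))          # raw events: (x, y, t, polarity)
representation = rng.random((32, 9))  # e.g. flattened voxel-grid patches

C1 = ot.dist(events, events)                  # intra-space distance matrices
C2 = ot.dist(representation, representation)
p = np.full(len(C1), 1 / len(C1))             # uniform masses
q = np.full(len(C2), 1 / len(C2))

gwd = ot.gromov.gromov_wasserstein2(C1, C2, p, q, loss_fun="square_loss")
print("GWD (lower should track higher task score):", gwd)
```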
+
+
+
+
+ + ♻ ☆ Learning Melanocytic Cell Masks from Adjacent Stained Tissue MICCAI + + +
+ Melanoma is one of the most aggressive forms of skin cancer, causing a
+large proportion of skin cancer deaths. However, melanoma diagnoses by
+pathologists show low interrater reliability. As melanoma is a cancer of the
+melanocyte, there is a clear need to develop a melanocytic cell segmentation
+tool that is agnostic to pathologist variability and automates pixel-level
+annotation. Gigapixel-level pathologist labeling, however, is impractical.
+Herein, we propose a means to train deep neural networks for melanocytic
+cell segmentation from hematoxylin and eosin (H&E) stained sections and
+paired immunohistochemistry (IHC) of adjacent tissue sections, achieving a
+mean IOU of 0.64 despite imperfect ground-truth labels.
+
+
+
+ comment: Accepted at Medical Image Learning with Limited & Noisy Data + Workshop, Medical Image Computing and Computer Assisted Interventions + (MICCAI) 2022 +
+
+
+
+
+ + ♻ ☆ A Stable and Scalable Method for Solving Initial Value PDEs with Neural + Networks ICLR 2023 + + +
+ Unlike conventional grid and mesh based methods for solving partial +differential equations (PDEs), neural networks have the potential to break the +curse of dimensionality, providing approximate solutions to problems where +using classical solvers is difficult or impossible. While global minimization +of the PDE residual over the network parameters works well for boundary value +problems, catastrophic forgetting impairs the applicability of this approach to +initial value problems (IVPs). In an alternative local-in-time approach, the +optimization problem can be converted into an ordinary differential equation +(ODE) on the network parameters and the solution propagated forward in time; +however, we demonstrate that current methods based on this approach suffer from +two key issues. First, following the ODE produces an uncontrolled growth in the +conditioning of the problem, ultimately leading to unacceptably large numerical +errors. Second, as the ODE methods scale cubically with the number of model +parameters, they are restricted to small neural networks, significantly +limiting their ability to represent intricate PDE initial conditions and +solutions. Building on these insights, we develop Neural IVP, an ODE based IVP +solver which prevents the network from getting ill-conditioned and runs in time +linear in the number of parameters, enabling us to evolve the dynamics of +challenging PDEs with neural networks. + +
+
+ comment: ICLR 2023. Code available at https://github.com/mfinzi/neural-ivp +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Prompting Vision Language Model with Knowledge from Large Language Model + for Knowledge-Based VQA + + +
+ Knowledge-based visual question answering is a very challenging and widely
+studied task. Previous methods adopt the implicit knowledge in large
+language models (LLMs) to achieve excellent results, but we argue that
+existing methods may suffer from a biased understanding of the image and
+insufficient knowledge to solve the problem. In this paper, we propose
+PROOFREAD (PROmpting vision language model with knOwledge From laRgE
+lAnguage moDel), a novel, lightweight and efficient knowledge-based VQA
+framework, which makes the vision language model and the large language
+model cooperate to give full play to their respective strengths and
+bootstrap each other. In detail, our proposed method uses the LLM to obtain
+knowledge explicitly, uses the vision language model, which can see the
+image, to get the knowledge answer, and introduces a knowledge perceiver to
+filter out knowledge that is harmful for getting the correct final answer.
+Experimental results on two datasets prove the effectiveness of our
+approach. Our method outperforms all state-of-the-art methods on the A-OKVQA
+dataset in two settings and also achieves relatively good performance on the
+OKVQA dataset.
+
+
+
+
+
+
+ + ☆ It Takes a Village: Multidisciplinarity and Collaboration for the + Development of Embodied Conversational Agents + + +
+ Embodied conversational agent (ECA) development is a time-consuming and
+costly process that calls for knowledge in a plethora of different and not
+necessarily adjacent disciplines. Engaging in activities outside of one's core
+research to acquire peripheral skills can impede innovation and potentially
+restrict the outcomes within the boundaries of those acquired skills. We
+propose to tackle this challenge by creating collaborative communities of
+experts from the disciplines that contribute to the field of ECAs, which,
+through clearly defined roles, expectations and communication channels, can
+help extend ECA research.
+
+
+ comment: 5 pages, 1 figure, ACM CUI 2023: Proceedings of the 5th Conference on + Conversational User Interfaces - Is CUI ready yet?, This paper discusses the + challenges of ECA development and how they can be tackled via + multidisciplinary collaboration +
+
+
+
+
+ + ☆ Deep Video Codec Control + + +
+ Lossy video compression is commonly used when transmitting and storing video
+data. Unified video codecs (e.g., H.264 or H.265) remain the \emph{de facto}
+standard, despite the availability of advanced (neural) compression approaches.
+Transmitting videos in the face of dynamic network bandwidth conditions
+requires video codecs to adapt to vastly different compression strengths. Rate
+control modules augment the codec's compression such that bandwidth constraints
+are satisfied and video distortion is minimized. While both standard video
+codecs and their rate control modules are developed to minimize video
+distortion w.r.t. human quality assessment, preserving the downstream
+performance of deep vision models is not considered. In this paper, we present
+the first end-to-end learnable deep video codec control that considers both
+bandwidth constraints and downstream vision performance, while not breaking
+existing standardization. We demonstrate for two common vision tasks (semantic
+segmentation and optical flow estimation) and on two different datasets that
+our deep codec control better preserves downstream performance than using
+2-pass average bit rate control while meeting dynamic bandwidth constraints and
+adhering to standardizations.
+
+
+ comment: 22 pages, 26 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023 + + +
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 56 + +
+
+
+ + ☆ ParaGuide: Guided Diffusion Paraphrasers for Plug-and-Play Textual Style + Transfer + + +
+ Textual style transfer is the task of transforming stylistic properties of
+text while preserving meaning. Target "styles" can be defined in numerous ways,
+ranging from single attributes (e.g., formality) to authorship (e.g.,
+Shakespeare). Previous unsupervised style-transfer approaches generally rely on
+significant amounts of labeled data for only a fixed set of styles or require
+large language models. In contrast, we introduce a novel diffusion-based
+framework for general-purpose style transfer that can be flexibly adapted to
+arbitrary target styles at inference time. Our parameter-efficient approach,
+ParaGuide, leverages paraphrase-conditioned diffusion models alongside
+gradient-based guidance from both off-the-shelf classifiers and strong existing
+style embedders to transform the style of text while preserving semantic
+information. We validate the method on the Enron Email Corpus, with both human
+and automatic evaluations, and find that it outperforms strong baselines on
+formality, sentiment, and even authorship style transfer.
+
+
+
+
+
+ + ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ The reasoning capabilities of Large Language Models (LLMs) play a pivotal
+role in the realm of embodied artificial intelligence. Although there are
+effective methods like program-of-thought prompting for LLMs, which use
+programming languages to tackle complex reasoning tasks, the specific impact of
+code data on the improvement of reasoning capabilities remains under-explored.
+To address this gap, we propose the complexity-impacted reasoning score (CIRS),
+which combines structural and logical attributes, to measure the correlation
+between code and reasoning abilities. Specifically, we use the abstract syntax
+tree to encode the structural information and calculate logical complexity by
+considering the difficulty and the cyclomatic complexity. Through an empirical
+analysis, we find that not all code data of arbitrary complexity can be learned
+or understood by LLMs; an optimal level of complexity is critical to the
+improvement of reasoning abilities by program-aided prompting. We then design
+an auto-synthesizing and stratifying algorithm, and apply it to instruction
+generation for mathematical reasoning and code data filtering for code
+generation tasks. Extensive results demonstrate the effectiveness of our
+proposed approach. Code will be integrated into the EasyInstruct framework at
+https://github.com/zjunlp/EasyInstruct.
+
+
+ comment: Work in progress +
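+
+ To make the complexity measure concrete, here is a minimal sketch of how a
+CIRS-like score could be computed with Python's standard ast module. The paper
+does not publish this exact formula; the weighting and the structural proxy
+(node count times tree depth) are assumptions for illustration only.
+
+import ast
+
+def cyclomatic_complexity(tree: ast.AST) -> int:
+    # McCabe-style count: 1 plus one per branching construct.
+    branches = (ast.If, ast.For, ast.While, ast.Try, ast.BoolOp)
+    return 1 + sum(isinstance(node, branches) for node in ast.walk(tree))
+
+def structural_score(tree: ast.AST) -> int:
+    # Proxy for structural information: AST node count times maximum depth.
+    def depth(node, d=1):
+        children = list(ast.iter_child_nodes(node))
+        return d if not children else max(depth(c, d + 1) for c in children)
+    return sum(1 for _ in ast.walk(tree)) * depth(tree)
+
+def complexity_score(code: str) -> int:
+    # Hypothetical stand-in for CIRS, combining structural and logical attributes.
+    tree = ast.parse(code)
+    return structural_score(tree) * cyclomatic_complexity(tree)
+
+print(complexity_score("def f(x):\n    if x > 0:\n        return x\n    return -x"))
+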
+
+
+
+
+ + ☆ Vulgar Remarks Detection in Chittagonian Dialect of Bangla + + +
+ The negative effects of online bullying and harassment are increasing with
+Internet popularity, especially in social media. One solution is using natural
+language processing (NLP) and machine learning (ML) methods for the automatic
+detection of harmful remarks, but these methods are limited in low-resource
+languages like the Chittagonian dialect of Bangla. This study focuses on
+detecting vulgar remarks in social media using supervised ML and deep learning
+algorithms. Logistic Regression achieved promising accuracy (0.91), while a
+simple RNN with Word2vec and fastText had lower accuracy (0.84-0.90),
+highlighting the issue that neural network algorithms require more data.
+
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Characterizing Learning Curves During Language Model Pre-Training: + Learning, Forgetting, and Stability + + +
+ How do language models learn to make predictions during pre-training? To +study this question, we extract learning curves from five autoregressive +English language model pre-training runs, for 1M tokens in context. We observe +that the language models generate short repetitive phrases before learning to +generate longer and more coherent text. We quantify the final surprisal, +within-run variability, age of acquisition, forgettability, and cross-run +variability of learning curves for individual tokens in context. More frequent +tokens reach lower final surprisals, exhibit less variability within and across +pre-training runs, are learned earlier, and are less likely to be "forgotten" +during pre-training. Higher n-gram probabilities further accentuate these +effects. Independent of the target token, shorter and more frequent contexts +correlate with marginally more stable and quickly acquired predictions. Effects +of part-of-speech are also small, although nouns tend to be acquired later and +less stably than verbs, adverbs, and adjectives. Our work contributes to a +better understanding of language model pre-training dynamics and informs the +deployment of stable language models in practice. + +
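+
+ As a concrete reference point, per-token surprisal of the kind tracked in
+these learning curves can be read off any autoregressive LM. The sketch below
+uses GPT-2 purely as an illustrative stand-in, since the paper's own
+pre-training runs are not assumed to be available here.
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+ids = tok("The cat sat on the mat", return_tensors="pt").input_ids
+with torch.no_grad():
+    logits = model(ids).logits
+# Surprisal of token t is -log2 p(t | context); shift logits against targets.
+log_probs = torch.log_softmax(logits[:, :-1], dim=-1)
+token_lp = log_probs.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1)
+surprisal_bits = -token_lp / torch.log(torch.tensor(2.0))
+for t, s in zip(tok.convert_ids_to_tokens(ids[0, 1:]), surprisal_bits[0]):
+    print(f"{t:>10s}  {s.item():.2f}")
+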
+
+
+
+
+ + ☆ Rethinking Machine Ethics -- Can LLMs Perform Moral Reasoning through + the Lens of Moral Theories? + + +
+ Making moral judgments is an essential step toward developing ethical AI
+systems. Prevalent approaches are mostly implemented in a bottom-up manner,
+which uses a large set of annotated data to train models based on crowd-sourced
+opinions about morality. These approaches have been criticized for potentially
+overgeneralizing a limited group of annotators' moral stances and lacking
+explainability. In contrast, top-down approaches make moral judgments grounded
+in a set of principles. However, they have remained largely conceptual, owing
+to the limitations of earlier language models and the unresolved debate over
+which moral principles to adopt. In this study, we propose a flexible framework
+to steer Large Language Models (LLMs) to perform moral reasoning with
+well-established moral theories from interdisciplinary research. The
+theory-guided top-down framework can incorporate various moral theories. Our
+experiments demonstrate the effectiveness of the proposed framework on datasets
+derived from moral theories. Furthermore, we show the alignment between
+different moral theories and existing morality datasets. Our analysis exhibits
+the potential and flaws of existing resources (models and datasets) in
+developing explainable moral judgment-making systems.
+
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for the
+Text-to-SQL task. However, the absence of a systematic benchmark inhibits the
+development of effective, efficient and economic LLM-based Text-to-SQL
+solutions. To address this challenge, in this paper, we first conduct a
+systematic and extensive comparison of existing prompt engineering methods,
+including question representation, example selection and example organization,
+and with these experimental results, we elaborate on their pros and cons. Based
+on these findings, we propose a new integrated solution, named DAIL-SQL, which
+refreshes the Spider leaderboard with 86.6% execution accuracy and sets a new
+bar. Towards an efficient and economic LLM-based Text-to-SQL solution, we
+emphasize token efficiency in prompt engineering and compare the prior studies
+under this metric. Additionally, we investigate open-source LLMs in in-context
+learning, and further enhance their performance with task-specific supervised
+fine-tuning. Our explorations highlight open-source LLMs' potential in
+Text-to-SQL, as well as the advantages and disadvantages of task-specific
+supervised fine-tuning. We hope that our work provides a deeper understanding
+of Text-to-SQL with LLMs and inspires further investigations and broad
+applications.
+
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
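+
+ For orientation, the sketch below shows one plausible "question
+representation" prompt of the kind the paper benchmarks. DAIL-SQL's actual
+template and example-selection strategy live in the linked repository, so the
+comment-style format here is only an assumption.
+
+def build_prompt(schema: str, examples: list[tuple[str, str]], question: str) -> str:
+    parts = ["/* Given the database schema, answer the question with a SQL query. */",
+             schema]
+    for q, sql in examples:  # example selection/organization is the paper's focus
+        parts += [f"/* Question: {q} */", sql]
+    parts += [f"/* Question: {question} */"]
+    return "\n".join(parts)
+
+schema = "CREATE TABLE singer(id INT, name TEXT, age INT);"
+demos = [("How many singers are there?", "SELECT count(*) FROM singer;")]
+print(build_prompt(schema, demos, "List the names of singers older than 30."))
+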
+
+
+
+
+ + ☆ Historical patterns of rice farming explain modern-day language use in + China and Japan more than modernization and urbanization + + +
+ We used natural language processing to analyze a billion words to study +cultural differences on Weibo, one of China's largest social media platforms. +We compared predictions from two common explanations about cultural differences +in China (economic development and urban-rural differences) against the +less-obvious legacy of rice versus wheat farming. Rice farmers had to +coordinate shared irrigation networks and exchange labor to cope with higher +labor requirements. In contrast, wheat relied on rainfall and required half as +much labor. We test whether this legacy made southern China more +interdependent. Across all word categories, rice explained twice as much +variance as economic development and urbanization. Rice areas used more words +reflecting tight social ties, holistic thought, and a cautious, prevention +orientation. We then used Twitter data comparing prefectures in Japan, which +largely replicated the results from China. This provides crucial evidence of +the rice theory in a different nation, language, and platform. + +
+
+ comment: Includes Supplemental Materials +
+
+
+
+
+ + ☆ A Framework for Responsible Development of Automated Student Feedback + with Generative AI + + +
+ Providing rich feedback to students is essential for supporting student +learning. Recent advances in generative AI, particularly within large language +modelling (LLM), provide the opportunity to deliver repeatable, scalable and +instant automatically generated feedback to students, making abundant a +previously scarce and expensive learning resource. Such an approach is feasible +from a technical perspective due to these recent advances in Artificial +Intelligence (AI) and Natural Language Processing (NLP); while the potential +upside is a strong motivator, doing so introduces a range of potential ethical +issues that must be considered as we apply these technologies. The +attractiveness of AI systems is that they can effectively automate the most +mundane tasks; but this risks introducing a "tyranny of the majority", where +the needs of minorities in the long tail are overlooked because they are +difficult to automate. + Developing machine learning models that can generate valuable and authentic +feedback requires the input of human domain experts. The choices we make in +capturing this expertise -- whose, which, when, and how -- will have +significant consequences for the nature of the resulting feedback. How we +maintain our models will affect how that feedback remains relevant given +temporal changes in context, theory, and prior learning profiles of student +cohorts. These questions are important from an ethical perspective; but they +are also important from an operational perspective. Unless they can be +answered, our AI generated systems will lack the trust necessary for them to be +useful features in the contemporary learning environment. + This article will outline the frontiers of automated feedback, identify the +ethical issues involved in the provision of automated feedback and present a +framework to assist academics to develop such systems responsibly. + +
+
+ comment: 10 pages, under review at IEEE TLT +
+
+
+
+
+ + ☆ TaskLAMA: Probing the Complex Task Understanding of Language Models + + +
+ Structured Complex Task Decomposition (SCTD) is the problem of breaking down +a complex real-world task (such as planning a wedding) into a directed acyclic +graph over individual steps that contribute to achieving the task, with edges +specifying temporal dependencies between them. SCTD is an important component +of assistive planning tools, and a challenge for commonsense reasoning systems. +We probe how accurately SCTD can be done with the knowledge extracted from +Large Language Models (LLMs). We introduce a high-quality human-annotated +dataset for this problem and novel metrics to fairly assess performance of LLMs +against several baselines. Our experiments reveal that LLMs are able to +decompose complex tasks into individual steps effectively, with a relative +improvement of 15% to 280% over the best baseline. We also propose a number of +approaches to further improve their performance, with a relative improvement of +7% to 37% over the base model. However, we find that LLMs still struggle to +predict pairwise temporal dependencies, which reveals a gap in their +understanding of complex tasks. + +
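+
+ The target structure is easy to picture: steps as nodes, temporal
+dependencies as edges, and any topological order as one valid plan. The wedding
+steps below are invented for illustration.
+
+from graphlib import TopologicalSorter
+
+steps = {  # step -> set of steps that must happen first
+    "book venue": set(),
+    "send invitations": {"book venue"},
+    "hire caterer": {"book venue"},
+    "hold ceremony": {"send invitations", "hire caterer"},
+}
+print(list(TopologicalSorter(steps).static_order()))
+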
+
+
+
+
+ + ☆ KGConv, a Conversational Corpus grounded in Wikidata + + +
+ We present KGConv, a large, conversational corpus of 71k conversations where +each question-answer pair is grounded in a Wikidata fact. Conversations contain +on average 8.6 questions and for each Wikidata fact, we provide multiple +variants (12 on average) of the corresponding question using templates, human +annotations, hand-crafted rules and a question rewriting neural model. We +provide baselines for the task of Knowledge-Based, Conversational Question +Generation. KGConv can further be used for other generation and analysis tasks +such as single-turn question generation from Wikidata triples, question +rewriting, question answering from conversation or from knowledge graphs and +quiz generation. + +
+
+
+
+
+ + ☆ Enhancing OCR Performance through Post-OCR Models: Adopting Glyph + Embedding for Improved Correction + + +
+ The study investigates the potential of post-OCR models to overcome +limitations in OCR models and explores the impact of incorporating glyph +embedding on post-OCR correction performance. In this study, we have developed +our own post-OCR correction model. The novelty of our approach lies in +embedding the OCR output using CharBERT and our unique embedding technique, +capturing the visual characteristics of characters. Our findings show that +post-OCR correction effectively addresses deficiencies in inferior OCR models, +and glyph embedding enables the model to achieve superior results, including +the ability to correct individual words. + +
+
+
+
+
+ + ☆ A Classification-Guided Approach for Adversarial Attacks against Neural + Machine Translation + + +
+ Neural Machine Translation (NMT) models have been shown to be vulnerable to +adversarial attacks, wherein carefully crafted perturbations of the input can +mislead the target model. In this paper, we introduce ACT, a novel adversarial +attack framework against NMT systems guided by a classifier. In our attack, the +adversary aims to craft meaning-preserving adversarial examples whose +translations by the NMT model belong to a different class than the original +translations in the target language. Unlike previous attacks, our new approach +has a more substantial effect on the translation by altering the overall +meaning, which leads to a different class determined by a classifier. To +evaluate the robustness of NMT models to this attack, we propose enhancements +to existing black-box word-replacement-based attacks by incorporating output +translations of the target NMT model and the output logits of a classifier +within the attack process. Extensive experiments in various settings, including +a comparison with existing untargeted attacks, demonstrate that the proposed +attack is considerably more successful in altering the class of the output +translation and has more effect on the translation. This new paradigm can show +the vulnerabilities of NMT systems by focusing on the class of translation +rather than the mere translation quality as studied traditionally. + +
+
+
+
+
+ + ☆ PronounFlow: A Hybrid Approach for Calibrating Pronouns in Sentences + + +
+ Flip through any book or listen to any song lyrics, and you will come across
+pronouns that, in certain cases, can hinder meaning comprehension, especially
+for machines. As the role of having cognitive machines becomes pervasive in our
+lives, numerous systems have been developed to resolve pronouns under various
+challenges. Commensurate with this, it is believed that having systems able to
+disambiguate pronouns in sentences will help towards the endowment of machines
+with commonsense and reasoning abilities like those found in humans. However,
+one problem these systems face with modern English is the lack of
+gender-neutral pronouns, which leads people to alternate between masculine,
+feminine, or plural forms to sidestep the issue. Since humanity aims at
+building systems in the full-bodied sense we usually reserve for people, what
+happens when pronouns in written text, like plural or epicene ones, refer to
+unspecified entities whose gender is not necessarily known? Wouldn't that put
+extra barriers to existing coreference resolution systems? Towards answering
+those questions, we develop PronounFlow, a neural-symbolic system that utilizes
+the best of both worlds: it reads any English sentence with pronouns and
+entities, identifies which of them are not tied to each other, and makes
+suggestions on which to use to avoid biases. Undertaken experiments show that
+PronounFlow not only alternates pronouns in sentences based on the collective
+human knowledge around us but also considerably helps coreference resolution
+systems with the pronoun disambiguation process.
+
+
+ comment: 13 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Classification-Aware Neural Topic Model Combined With Interpretable + Analysis -- For Conflict Classification + + +
+ A large number of conflict events affect the world at any given time. To
+analyse such conflict events effectively, this paper presents a
+Classification-Aware Neural Topic Model (CANTM-IA) for Conflict Information
+Classification and Topic Discovery. The model provides a reliable
+interpretation of classification results and discovered topics by introducing
+interpretability analysis. At the same time, interpretation is introduced into
+the model architecture to improve the classification performance of the model
+and to allow interpretation to focus further on the details of the data.
+Finally, the model architecture is optimised to reduce the complexity of the
+model.
+
+
+ comment: Accepted by RANLP 2023 +
+
+
+
+
+ + ☆ Multi-party Goal Tracking with LLMs: Comparing Pre-training, + Fine-tuning, and Prompt Engineering + + +
+ This paper evaluates the extent to which current Large Language Models (LLMs) +can capture task-oriented multi-party conversations (MPCs). We have recorded +and transcribed 29 MPCs between patients, their companions, and a social robot +in a hospital. We then annotated this corpus for multi-party goal-tracking and +intent-slot recognition. People share goals, answer each other's goals, and +provide other people's goals in MPCs - none of which occur in dyadic +interactions. To understand user goals in MPCs, we compared three methods in +zero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks +to train DialogLM using LED, and employed prompt engineering techniques with +GPT-3.5-turbo, to determine which approach can complete this novel task with +limited data. GPT-3.5-turbo significantly outperformed the others in a few-shot +setting. The `reasoning' style prompt, when given 7% of the corpus as example +annotated conversations, was the best performing method. It correctly annotated +62.32% of the goal tracking MPCs, and 69.57% of the intent-slot recognition +MPCs. A `story' style prompt increased model hallucination, which could be +detrimental if deployed in safety-critical settings. We conclude that +multi-party conversations still challenge state-of-the-art LLMs. + +
+
+ comment: Accepted and will appear in the Proceedings of SIGdial 2023 +
+
+
+
+
+ + ☆ CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for + Multimodal Machine Translation ICCV + + +
+ There has been a growing interest in developing multimodal machine +translation (MMT) systems that enhance neural machine translation (NMT) with +visual knowledge. This problem setup involves using images as auxiliary +information during training, and more recently, eliminating their use during +inference. Towards this end, previous works face a challenge in training +powerful MMT models from scratch due to the scarcity of annotated multilingual +vision-language data, especially for low-resource languages. Simultaneously, +there has been an influx of multilingual pre-trained models for NMT and +multimodal pre-trained models for vision-language tasks, primarily in English, +which have shown exceptional generalisation ability. However, these are not +directly applicable to MMT since they do not provide aligned multimodal +multilingual features for generative tasks. To alleviate this issue, instead of +designing complex modules for MMT, we propose CLIPTrans, which simply adapts +the independently pre-trained multimodal M-CLIP and the multilingual mBART. In +order to align their embedding spaces, mBART is conditioned on the M-CLIP +features by a prefix sequence generated through a lightweight mapping network. +We train this in a two-stage pipeline which warms up the model with image +captioning before the actual translation task. Through experiments, we +demonstrate the merits of this framework and consequently push forward the +state-of-the-art across standard benchmarks by an average of +2.67 BLEU. The +code can be found at www.github.com/devaansh100/CLIPTrans. + +
+
+ comment: 15 pages, 9 figures, to be published In Proceedings of International + Conference of Computer Vision(ICCV), 2023 +
+
+
+
+
+ + ☆ FurChat: An Embodied Conversational Agent using LLMs, Combining Open and + Closed-Domain Dialogue with Facial Expressions SIGDIAL 2023 + + +
+ We demonstrate an embodied conversational agent that can function as a +receptionist and generate a mixture of open and closed-domain dialogue along +with facial expressions, by using a large language model (LLM) to develop an +engaging conversation. We deployed the system onto a Furhat robot, which is +highly expressive and capable of using both verbal and nonverbal cues during +interaction. The system was designed specifically for the National Robotarium +to interact with visitors through natural conversations, providing them with +information about the facilities, research, news, upcoming events, etc. The +system utilises the state-of-the-art GPT-3.5 model to generate such information +along with domain-general conversations and facial expressions based on prompt +engineering. + +
+
+ comment: 5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the + Special Interest Group on Discourse and Dialogue), for the demo video, see + https://youtu.be/fwtUl1kl22s +
+
+
+
+
+ + ☆ Shared Lexical Items as Triggers of Code Switching ACL + + +
+ Why do bilingual speakers code-switch (mix their two languages)? Among the +several theories that attempt to explain this natural and ubiquitous +phenomenon, the Triggering Hypothesis relates code-switching to the presence of +lexical triggers, specifically cognates and proper names, adjacent to the +switch point. We provide a fuller, more nuanced and refined exploration of the +triggering hypothesis, based on five large datasets in three language pairs, +reflecting both spoken and written bilingual interactions. Our results show +that words that are assumed to reside in a mental lexicon shared by both +languages indeed trigger code-switching; that the tendency to switch depends on +the distance of the trigger from the switch point; and on whether the trigger +precedes or succeeds the switch; but not on the etymology of the trigger words. +We thus provide strong, robust, evidence-based confirmation to several +hypotheses on the relationships between lexical triggers and code-switching. + +
+
+ comment: This is the author's final version; the article has been accepted for + publication in the Transactions of the Association for Computational + Linguistics (TACL) +
+
+
+
+
+ + ☆ Benchmarking the Generation of Fact Checking Explanations ACL + + +
+ Fighting misinformation is a challenging, yet crucial, task. Despite the
+growing number of experts being involved in manual fact-checking, this activity
+is time-consuming and cannot keep up with the ever-increasing amount of fake
+news produced daily. Hence, automating this process is necessary to help curb
+misinformation. Thus far, researchers have mainly focused on claim veracity
+classification. In this paper, instead, we address the generation of
+justifications (textual explanations of why a claim is classified as either
+true or false) and benchmark it with novel datasets and advanced baselines. In
+particular, we focus on summarization approaches over unstructured knowledge
+(i.e., news articles) and we experiment with several extractive and abstractive
+strategies. We employed two datasets with different styles and structures, in
+order to assess the generalizability of our findings. Results show that, in
+justification production, summarization benefits from the claim information,
+and, in particular, that a claim-driven extractive step improves abstractive
+summarization performance. Finally, we show that although cross-dataset
+experiments suffer from performance degradation, a unique model trained on a
+combination of the two datasets is able to retain style information in an
+efficient manner.
+
+
+ comment: Accepted to TACL. This arXiv version is a pre-MIT Press publication + version +
+
+
+
+
+ + ☆ Enhancing Psychological Counseling with Large Language Model: A + Multifaceted Decision-Support System for Non-Professionals + + +
+ In the contemporary landscape of social media, an alarming number of users
+express negative emotions, some of which manifest as strong suicidal
+intentions. This situation underscores a profound need for trained
+psychological counselors who can enact effective mental interventions. However,
+training these professionals is a necessary but time-consuming task.
+Consequently, the mobilization of non-professionals or volunteers in this
+capacity emerges as a pressing concern. Leveraging the capabilities of
+artificial intelligence, and in particular the recent advances in large
+language models, offers a viable solution to this challenge. This paper
+introduces a novel model constructed on the foundation of large language models
+to fully assist non-professionals in providing psychological interventions on
+online user discourses. This framework makes it plausible to harness the power
+of non-professional counselors in a meaningful way. A comprehensive study was
+conducted involving ten professional psychological counselors of varying
+expertise, evaluating the system across five critical dimensions. The findings
+affirm that our system is capable of analyzing patients' issues with relative
+accuracy and proffering professional-level strategy recommendations, thereby
+enhancing support for non-professionals. This research serves as a compelling
+validation of the application of large language models in the field of
+psychology and lays the groundwork for a new paradigm of community-based mental
+health support.
+
+
+
+
+
+ + ☆ The Anatomy of Conspirators: Unveiling Traits using a Comprehensive + Twitter Dataset + + +
+ The discourse around conspiracy theories is currently thriving amidst the
+rampant misinformation prevalent in online environments. Research in this field
+has been focused on detecting conspiracy theories on social media, often
+relying on limited datasets. In this study, we present a novel methodology for
+constructing a Twitter dataset that encompasses accounts engaged in
+conspiracy-related activities throughout the year 2022. Our approach centers on
+data collection that is independent of specific conspiracy theories and
+information operations. Additionally, our dataset includes a control group
+comprising randomly selected users who can be fairly compared to the
+individuals involved in conspiracy activities. This comprehensive collection
+effort yielded a total of 15K accounts and 37M tweets extracted from their
+timelines. We conduct a comparative analysis of the two groups across three
+dimensions: topics, profiles, and behavioral characteristics. The results
+indicate that conspiracy and control users exhibit similarity in terms of their
+profile metadata characteristics. However, they diverge significantly in terms
+of behavior and activity, particularly regarding the discussed topics, the
+terminology used, and their stance on trending subjects. Interestingly, there
+is no significant disparity in the presence of bot users between the two
+groups, suggesting that conspiracy and automation are orthogonal concepts.
+Finally, we develop a classifier to identify conspiracy users using 93
+features, some of which are commonly employed in the literature for troll
+identification. The results demonstrate a high accuracy level (an average F1
+score of 0.98), enabling us to uncover the most discriminative features
+associated with conspiracy-related accounts.
+
+
+
+
+
+ + ☆ Evaluation and Analysis of Hallucination in Large Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) have recently achieved remarkable
+success. However, LVLMs are still plagued by the hallucination problem, which
+limits their practicality in many scenarios. Hallucination refers to
+information in LVLMs' responses that does not exist in the visual input, which
+poses potential risks of substantial consequences. There has been limited work
+studying hallucination evaluation in LVLMs. In this paper, we propose
+Hallucination Evaluation based on Large Language Models (HaELM), an LLM-based
+hallucination evaluation framework. HaELM achieves approximately 95% of
+ChatGPT's performance and has additional advantages including low cost,
+reproducibility, privacy preservation and local deployment. Leveraging HaELM,
+we evaluate the hallucination in current LVLMs. Furthermore, we analyze the
+factors contributing to hallucination in LVLMs and offer helpful suggestions to
+mitigate the hallucination problem. Our training data and human annotation
+hallucination data will be made public soon.
+
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge + Distillation from BERT + + +
+ Spiking neural networks (SNNs) offer a promising avenue to implement deep +neural networks in a more energy-efficient way. However, the network +architectures of existing SNNs for language tasks are too simplistic, and deep +architectures have not been fully explored, resulting in a significant +performance gap compared to mainstream transformer-based networks such as BERT. +To this end, we improve a recently-proposed spiking transformer (i.e., +Spikformer) to make it possible to process language tasks and propose a +two-stage knowledge distillation method for training it, which combines +pre-training by distilling knowledge from BERT with a large collection of +unlabelled texts and fine-tuning with task-specific instances via knowledge +distillation again from the BERT fine-tuned on the same training examples. +Through extensive experimentation, we show that the models trained with our +method, named SpikeBERT, outperform state-of-the-art SNNs and even achieve +comparable results to BERTs on text classification tasks for both English and +Chinese with much less energy consumption. + +
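+
+ The fine-tuning-stage distillation signal can be pictured as standard logit
+distillation; the sketch below is a generic formulation that omits the
+pre-training stage and any feature-level terms the paper may use, so the
+temperature and mixing weight are illustrative assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
+    # Soft-target term: match the fine-tuned teacher's tempered distribution.
+    soft = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
+                    F.softmax(teacher_logits / T, dim=-1),
+                    reduction="batchmean") * T * T
+    # Hard-target term: ordinary cross-entropy on the gold labels.
+    hard = F.cross_entropy(student_logits, labels)
+    return alpha * soft + (1 - alpha) * hard
+
+s, t = torch.randn(8, 2), torch.randn(8, 2)
+print(kd_loss(s, t, torch.randint(0, 2, (8,))).item())
+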
+
+
+
+
+ + ☆ Large Language Models on the Chessboard: A Study on ChatGPT's Formal + Language Comprehension and Complex Reasoning Skills + + +
+ While large language models have made strides in natural language processing, +their proficiency in complex reasoning tasks requiring formal language +comprehension, such as chess, remains less investigated. This paper probes the +performance of ChatGPT, a sophisticated language model by OpenAI in tackling +such complex reasoning tasks, using chess as a case study. Through robust +metrics examining both the legality and quality of moves, we assess ChatGPT's +understanding of the chessboard, adherence to chess rules, and strategic +decision-making abilities. Our evaluation identifies limitations within +ChatGPT's attention mechanism that affect its formal language comprehension and +uncovers the model's underdeveloped self-regulation abilities. Our study also +reveals ChatGPT's propensity for a coherent strategy in its gameplay and a +noticeable uptick in decision-making assertiveness when the model is presented +with a greater volume of natural language or possesses a more lucid +understanding of the state of the chessboard. These findings contribute to the +growing exploration of language models' abilities beyond natural language +processing, providing valuable information for future research towards models +demonstrating human-like cognitive abilities. + +
+
+
+
+
+ + ☆ Sequential annotations for naturally-occurring HRI: first insights + + +
+ We explain the methodology we developed for improving the interactions
+accomplished by an embedded conversational agent, drawing from Conversation
+Analytic sequential and multimodal analysis. The use case is a Pepper robot
+that is expected to inform and orient users in a library. In order to propose
+and learn better interactive schema, we are creating a corpus of
+naturally-occurring interactions that will be made available to the community.
+To do so, we propose an annotation practice based on some theoretical
+underpinnings about the use of language and multimodal resources in human-robot
+interaction.
+
+
+ comment: Peer-reviewed workshop paper accepted for the ''Human-Robot + Conversational Interaction'' workshop that took place at the ''ACM/IEEE + International Conference on Human-Robot Interaction'' 2023 Conference in + Stockholm, Sweden +
+
+
+
+
+ + ☆ Killing two birds with one stone: Can an audio captioning system also be + used for audio-text retrieval? + + +
+ Automated Audio Captioning (AAC) aims to develop systems capable of
+describing an audio recording using a textual sentence. In contrast, Audio-Text
+Retrieval (ATR) systems seek to find the best matching audio recording(s) for a
+given textual query (Text-to-Audio) or vice versa (Audio-to-Text). These tasks
+require different types of systems: AAC employs a sequence-to-sequence model,
+while ATR utilizes a ranking model that compares audio and text representations
+within a shared projection subspace. However, this work investigates the
+relationship between AAC and ATR by exploring the ATR capabilities of an
+unmodified AAC system, without fine-tuning for the new task. Our AAC system
+consists of an audio encoder (ConvNeXt-Tiny) trained on AudioSet for audio
+tagging, and a transformer decoder responsible for generating sentences. For
+AAC, it achieves a high SPIDEr-FL score of 0.298 on Clotho and 0.472 on
+AudioCaps on average. For ATR, we propose using the standard Cross-Entropy loss
+values obtained for any audio/caption pair. Experimental results on the Clotho
+and AudioCaps datasets demonstrate decent recall values using this simple
+approach. For instance, we obtained a Text-to-Audio R@1 value of 0.382 for
+AudioCaps, which is above the current state-of-the-art method without external
+data. Interestingly, we observe that normalizing the loss values was necessary
+for Audio-to-Text retrieval.
+
+
+ comment: camera-ready version (14/08/23)
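+
+ The retrieval trick can be sketched independently of the exact model: score
+each audio/caption pair by the captioning decoder's mean cross-entropy on the
+caption and rank by it. The scorer below is a deliberately toy stand-in for the
+real ConvNeXt-Tiny/transformer system.
+
+def caption_nll(audio_feats, caption_tokens):
+    # Placeholder: in the real system this is the AAC decoder's mean
+    # token-level cross-entropy for the caption given the audio encoding.
+    return sum((a - len(w)) ** 2
+               for a, w in zip(audio_feats, caption_tokens)) / len(caption_tokens)
+
+audios = {"dog.wav": [3.0, 7.0], "rain.wav": [6.0, 2.0]}
+query = ["dog", "barking"]  # Text-to-Audio: lower loss = better match
+print(sorted(audios, key=lambda k: caption_nll(audios[k], query)))
+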
+
+
+
+
+ + ☆ Taxonomic Loss for Morphological Glossing of Low-Resource Languages + + +
+ Morpheme glossing is a critical task in automated language documentation and +can benefit other downstream applications greatly. While state-of-the-art +glossing systems perform very well for languages with large amounts of existing +data, it is more difficult to create useful models for low-resource languages. +In this paper, we propose the use of a taxonomic loss function that exploits +morphological information to make morphological glossing more performant when +data is scarce. We find that while the use of this loss function does not +outperform a standard loss function with regards to single-label prediction +accuracy, it produces better predictions when considering the top-n predicted +labels. We suggest this property makes the taxonomic loss function useful in a +human-in-the-loop annotation setting. + +
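+
+ A minimal PyTorch rendering of the idea: add a coarse-level penalty on top of
+the fine-grained gloss loss, so that near-misses inside the correct coarse
+morphological class are punished less. The 5-gloss/3-class grouping and the 0.5
+weight below are invented for illustration.
+
+import torch
+import torch.nn.functional as F
+
+FINE_TO_COARSE = torch.tensor([0, 0, 1, 1, 2])  # hypothetical gloss taxonomy
+
+def taxonomic_loss(logits, fine_targets, lam=0.5):
+    fine = F.cross_entropy(logits, fine_targets)
+    # Sum fine-gloss probabilities into coarse-class probabilities.
+    coarse_probs = torch.zeros(logits.size(0), 3).index_add_(
+        1, FINE_TO_COARSE, logits.softmax(-1))
+    coarse = F.nll_loss(coarse_probs.clamp_min(1e-9).log(),
+                        FINE_TO_COARSE[fine_targets])
+    return fine + lam * coarse
+
+logits, targets = torch.randn(4, 5), torch.randint(0, 5, (4,))
+print(taxonomic_loss(logits, targets).item())
+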
+
+
+
+
+ + ☆ Adapting text-based dialogue state tracker for spoken dialogues SIGDIAL 2023 + + +
+ Although there have been remarkable advances in dialogue systems through the +dialogue systems technology competition (DSTC), it remains one of the key +challenges to building a robust task-oriented dialogue system with a speech +interface. Most of the progress has been made for text-based dialogue systems +since there are abundant datasets with written corpora while those with spoken +dialogues are very scarce. However, as can be seen from voice assistant systems +such as Siri and Alexa, it is of practical importance to transfer the success +to spoken dialogues. In this paper, we describe our engineering effort in +building a highly successful model that participated in the speech-aware +dialogue systems technology challenge track in DSTC11. Our model consists of +three major modules: (1) automatic speech recognition error correction to +bridge the gap between the spoken and the text utterances, (2) text-based +dialogue system (D3ST) for estimating the slots and values using slot +descriptions, and (3) post-processing for recovering the error of the estimated +slot value. Our experiments show that it is important to use an explicit +automatic speech recognition error correction module, post-processing, and data +augmentation to adapt a text-based dialogue state tracker for spoken dialogue +corpora. + +
+
+ comment: 8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at + SIGDIAL 2023 +
+
+
+
+
+ + ☆ Large language models converge toward human-like concept organization + + +
+ Large language models show human-like performance in knowledge extraction, +reasoning and dialogue, but it remains controversial whether this performance +is best explained by memorization and pattern matching, or whether it reflects +human-like inferential semantics and world knowledge. Knowledge bases such as +WikiData provide large-scale, high-quality representations of inferential +semantics and world knowledge. We show that large language models learn to +organize concepts in ways that are strikingly similar to how concepts are +organized in such knowledge bases. Knowledge bases model collective, +institutional knowledge, and large language models seem to induce such +knowledge from raw text. We show that bigger and better models exhibit more +human-like concept organization, across four families of language models and +three knowledge graph embeddings. + +
+
+
+
+
+ + ☆ Improving Neural Ranking Models with Traditional IR Methods + + +
+ Neural ranking methods based on large transformer models have recently gained +significant attention in the information retrieval community, and have been +adopted by major commercial solutions. Nevertheless, they are computationally +expensive to create, and require a great deal of labeled data for specialized +corpora. In this paper, we explore a low resource alternative which is a +bag-of-embedding model for document retrieval and find that it is competitive +with large transformer models fine tuned on information retrieval tasks. Our +results show that a simple combination of TF-IDF, a traditional keyword +matching method, with a shallow embedding model provides a low cost path to +compete well with the performance of complex neural ranking models on 3 +datasets. Furthermore, adding TF-IDF measures improves the performance of +large-scale fine tuned models on these tasks. + +
+
+ comment: Short paper, 4 pages +
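+
+ The recipe is cheap enough to show end-to-end: interpolate a TF-IDF keyword
+score with a second, embedding-like similarity. Character n-gram TF-IDF below
+stands in for the paper's shallow bag-of-embedding model, and the 0.5 mixing
+weight is an assumption.
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+docs = ["neural ranking with transformers",
+        "keyword matching with tf-idf",
+        "bag of embeddings for document retrieval"]
+query = ["tf-idf keyword retrieval"]
+
+word_vec = TfidfVectorizer().fit(docs)
+char_vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5)).fit(docs)
+score = (0.5 * cosine_similarity(word_vec.transform(query), word_vec.transform(docs))
+         + 0.5 * cosine_similarity(char_vec.transform(query), char_vec.transform(docs)))
+print(sorted(zip(score[0], docs), reverse=True))  # best document first
+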
+
+
+
+
+ + ☆ Recursively Summarizing Enables Long-Term Dialogue Memory in Large + Language Models + + +
+ Most open-domain dialogue systems suffer from forgetting important
+information, especially in a long-term conversation. Existing works usually
+train a specific retriever or summarizer to obtain key information from the
+past, which is time-consuming and highly depends on the quality of labeled
+data. To alleviate this problem, we propose to recursively generate
+summaries/memory using large language models (LLMs) to enhance long-term memory
+ability. Specifically, our method first stimulates LLMs to memorize small
+dialogue contexts and then recursively produce new memory using previous memory
+and following contexts. Finally, the LLM can easily generate a highly
+consistent response with the help of the latest memory. We evaluate our method
+using ChatGPT and text-davinci-003, and experiments on the widely-used public
+dataset show that our method can generate more consistent responses in a
+long-context conversation. Notably, our method is a potential solution to
+enable the LLM to model an extremely long context. Code and scripts will be
+released later.
+
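+
+ Schematically, the loop looks like the sketch below; llm is a stub for any
+chat-completion call (the paper uses ChatGPT and text-davinci-003), and the
+prompt wording is invented.
+
+def llm(prompt: str) -> str:
+    return f"[model output for: {prompt[:40]}...]"  # stub for illustration
+
+def chat_with_memory(turns):
+    memory = ""
+    for user_msg in turns:
+        reply = llm(f"Memory: {memory}\nUser: {user_msg}\nRespond consistently:")
+        # Recursively fold the newest exchange into the running memory.
+        memory = llm(f"Update the memory.\nOld memory: {memory}\n"
+                     f"New exchange: {user_msg} / {reply}")
+        yield reply
+
+for reply in chat_with_memory(["I adopted a cat named Miso.",
+                               "What is my pet's name?"]):
+    print(reply)
+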
+
+
+
+
+ + ☆ TransPrompt v2: A Transferable Prompting Framework for Cross-task Text + Classification + + +
+ Text classification is one of the most fundamental tasks in natural language
+processing (NLP). Recent advances with pre-trained language models (PLMs) have
+shown remarkable success on this task. However, the satisfying results obtained
+by PLMs heavily depend on large amounts of task-specific labeled data, which
+may not be feasible in many application scenarios due to data access and
+privacy constraints. The recently-proposed prompt-based fine-tuning paradigm
+improves the performance of PLMs for few-shot text classification with
+task-specific templates. Yet, it is unclear how the prompting knowledge can be
+transferred across tasks, for the purpose of mutual reinforcement. We propose
+TransPrompt v2, a novel transferable prompting framework for few-shot learning
+across similar or distant text classification tasks. For learning across
+similar tasks, we employ a multi-task meta-knowledge acquisition (MMA)
+procedure to train a meta-learner that captures the cross-task transferable
+knowledge. For learning across distant tasks, we further inject the task type
+descriptions into the prompt, and capture the intra-type and inter-type prompt
+embeddings among multiple distant tasks. Additionally, two de-biasing
+techniques are further designed to make the trained meta-learner more
+task-agnostic and unbiased towards any tasks. After that, the meta-learner can
+be adapted to each specific task with better parameter initialization.
+Extensive experiments show that TransPrompt v2 outperforms single-task and
+cross-task strong baselines over multiple NLP tasks and datasets. We further
+show that the meta-learner can effectively improve the performance of PLMs on
+previously unseen tasks. In addition, TransPrompt v2 also outperforms strong
+fine-tuning baselines when learning with full training sets.
+
+
+
+
+
+ + ☆ Robust Open-Set Spoken Language Identification and the CU MultiLang + Dataset + + +
+ Most state-of-the-art spoken language identification models are closed-set; +in other words, they can only output a language label from the set of classes +they were trained on. Open-set spoken language identification systems, however, +gain the ability to detect when an input exhibits none of the original +languages. In this paper, we implement a novel approach to open-set spoken +language identification that uses MFCC and pitch features, a TDNN model to +extract meaningful feature embeddings, confidence thresholding on softmax +outputs, and LDA and pLDA for learning to classify new unknown languages. We +present a spoken language identification system that achieves 91.76% accuracy +on trained languages and has the capability to adapt to unknown languages on +the fly. To that end, we also built the CU MultiLang Dataset, a large and +diverse multilingual speech corpus which was used to train and evaluate our +system. + +
+
+ comment: 6 pages, 1 table, 6 figures
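+
+ The open-set decision itself reduces to a thresholded softmax, as in the toy
+sketch below; the 0.7 threshold and three-language label set are illustrative,
+and the paper's LDA/pLDA adaptation step is omitted.
+
+import numpy as np
+
+LANGS = ["en", "es", "zh"]
+
+def classify(logits, threshold=0.7):
+    # Numerically stable softmax over language logits.
+    probs = np.exp(logits - logits.max())
+    probs /= probs.sum()
+    best = int(probs.argmax())
+    return LANGS[best] if probs[best] >= threshold else "unknown"
+
+print(classify(np.array([4.0, 1.0, 0.5])))  # confident -> "en"
+print(classify(np.array([1.0, 0.9, 0.8])))  # diffuse -> "unknown"
+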
+
+
+
+
+ + ☆ Document AI: A Comparative Study of Transformer-Based, Graph-Based + Models, and Convolutional Neural Networks For Document Layout Analysis + + +
+ Document AI aims to automatically analyze documents by leveraging natural
+language processing and computer vision techniques. One of the major tasks of
+Document AI is document layout analysis, which structures document pages by
+interpreting the content and spatial relationships of layout, image, and text.
+This task can be image-centric, wherein the aim is to identify and label
+various regions such as authors and paragraphs, or text-centric, where the
+focus is on classifying individual words in a document. Although there are
+increasingly sophisticated methods for improving layout analysis, doubts remain
+about the extent to which their findings can be generalized to a broader
+context. Specifically, prior work has developed systems based on very different
+architectures, such as transformer-based models, graph-based models, and CNNs,
+but no work has directly compared the effectiveness of these models. Moreover,
+while language-independent Document AI models capable of knowledge transfer
+have been developed, it remains to be investigated to what degree they can
+effectively transfer knowledge. In this study, we aim to fill these gaps by
+conducting a comparative evaluation of state-of-the-art models in document
+layout analysis and investigating the potential of cross-lingual layout
+analysis by utilizing machine translation techniques.
+
+
+
+
+
+ + ♻ ☆ A Deep Convolutional Neural Networks Based Multi-Task Ensemble Model for + Aspect and Polarity Classification in Persian Reviews + + +
+ Aspect-based sentiment analysis is of great importance and application
+because of its ability to identify all aspects discussed in the text. However,
+aspect-based sentiment analysis will be most effective when, in addition to
+identifying all the aspects discussed in the text, it can also identify their
+polarity. Most previous methods use the pipeline approach; that is, they first
+identify the aspects and then identify the polarities. Such methods are
+unsuitable for practical applications since they can lead to model errors.
+Therefore, in this study, we propose a multi-task learning model based on
+Convolutional Neural Networks (CNNs), which can simultaneously detect the
+aspect category and its polarity. Creating a single model alone may not provide
+the best predictions and can lead to errors such as bias and high variance. To
+reduce these errors and improve the efficiency of model predictions, combining
+several models, known as ensemble learning, may provide better results.
+Therefore, the main purpose of this article is to create a model based on an
+ensemble of multi-task deep convolutional neural networks to enhance sentiment
+analysis in Persian reviews. We evaluated the proposed method using a Persian
+language dataset in the movie domain. The Jaccard index and Hamming loss
+measures were used to evaluate the performance of the developed models. The
+results indicate that this new approach increases the efficiency of the
+sentiment analysis model in the Persian language.
+
+
+
+
+
+ + ♻ ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data +scientists) and practitioners (e.g., clinicians), hampering the full +utilization of ML for clinical data analysis. We investigated the potential of +the chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this +gap and perform ML analyses efficiently. Real-world clinical datasets and study +details from large trials across various medical specialties were presented to +chatGPT ADA without specific guidance. ChatGPT ADA autonomously developed +state-of-the-art ML models based on the original study's training data to +predict clinical outcomes such as cancer development, cancer progression, +disease complications, or biomarkers such as pathogenic gene sequences. +Strikingly, these ML models matched or outperformed their published +counterparts. We conclude that chatGPT ADA offers a promising avenue to +democratize ML in medicine, making advanced analytics accessible to non-ML +experts and promoting broader applications in medical research and practice. + +
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a meticulously curated, comprehensive +instruction dataset expressly designed for the biomolecular realm. +Mol-Instructions is composed of three pivotal components: molecule-oriented +instructions, protein-oriented instructions, and biomolecular text +instructions, each curated to enhance the understanding and prediction +capabilities of LLMs concerning biomolecular features and behaviors. Through +extensive instruction tuning experiments on the representative LLM, we +underscore the potency of Mol-Instructions to enhance the adaptability and +cognitive acuity of large models within the complex sphere of biomolecular +studies, thereby promoting advancements in the biomolecular research community. +Mol-Instructions is made publicly accessible for future research endeavors and +will be subjected to continual updates for enhanced applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions. Add + quantitative evaluations +
+
+
+
+
+ + ♻ ☆ An Empirical Investigation of the Role of Pre-training in Lifelong + Learning + + +
+ The lifelong learning paradigm in machine learning is an attractive +alternative to the more prominent isolated learning scheme not only due to its +resemblance to biological learning but also its potential to reduce energy +waste by obviating excessive model re-training. A key challenge to this +paradigm is the phenomenon of catastrophic forgetting. With the increasing +popularity and success of pre-trained models in machine learning, we pose the +question: What role does pre-training play in lifelong learning, specifically +with respect to catastrophic forgetting? We investigate existing methods in the +context of large, pre-trained models and evaluate their performance on a +variety of text and image classification tasks, including a large-scale study +using a novel data set of 15 diverse NLP tasks. Across all settings, we observe +that generic pre-training implicitly alleviates the effects of catastrophic +forgetting when learning multiple tasks sequentially compared to randomly +initialized models. We then further investigate why pre-training alleviates +forgetting in this setting. We study this phenomenon by analyzing the loss +landscape, finding that pre-trained weights appear to ease forgetting by +leading to wider minima. Based on this insight, we propose jointly optimizing +for current task loss and loss basin sharpness to explicitly encourage wider +basins during sequential fine-tuning. We show that this optimization approach +outperforms several state-of-the-art task-sequential continual learning +algorithms across multiple settings, occasionally even without retaining a +memory that scales in size with the number of tasks. + +
+
+
+
+
+ + ♻ ☆ Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model + + +
+ Sentiment analysis is the process of identifying and categorizing people's
+emotions or opinions regarding various topics. The analysis of Twitter
+sentiment has become an increasingly popular topic in recent years. In this
+paper, we present several machine learning models and a deep learning model to
+analyze the sentiment of Persian political tweets. Our analysis was conducted
+using Bag of Words and ParsBERT for word representation. We applied Gaussian
+Naive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random
+Forests, as well as a combination of CNN and LSTM, to classify the polarities
+of tweets. The results of this study indicate that deep learning with ParsBERT
+embeddings performs better than machine learning. The CNN-LSTM model had the
+highest classification accuracy, with 89 percent on the first dataset and 71
+percent on the second dataset. Due to the complexity of Persian, achieving this
+level of efficiency was a difficult task. The main objective of our research
+was to reduce the training time while maintaining the model's performance. As a
+result, several adjustments were made to the model architecture and parameters.
+In addition to achieving the objective, the performance was slightly improved
+as well.
+
+
+
+
+
+ + ♻ ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking
+closer to real-world applications, both in the nature of data tested ($X$:
+multi-channel, multi-paged, multi-industry; $Y$: class distributions and label
+set variety) and in classification tasks considered ($f$: multi-page document,
+page stream, and document bundle classification, ...). We identify the lack of
+public multi-page document classification datasets, formalize different
+classification tasks arising in application scenarios, and motivate the value
+of targeting efficient multi-page document representations. An experimental
+study on proposed multi-page document classification datasets demonstrates that
+current benchmarks have become irrelevant and need to be updated to evaluate
+complete documents, as they naturally occur in practice. This reality check
+also calls for more mature evaluation methodologies, covering calibration
+evaluation, inference complexity (time-memory), and a range of realistic
+distribution shifts (e.g., born-digital vs. scanning noise, shifting page
+order). Our study ends on a hopeful note by recommending concrete avenues for
+future improvements.
+
<div class="w3-clear"></div>
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ♻ ☆ Challenges of GPT-3-based Conversational Agents for Healthcare + + +
+ The potential to provide patients with faster information access while +allowing medical specialists to concentrate on critical tasks makes medical +domain dialog agents appealing. However, the integration of large-language +models (LLMs) into these agents presents certain limitations that may result in +serious consequences. This paper investigates the challenges and risks of using +GPT-3-based models for medical question-answering (MedQA). We perform several +evaluations contextualized in terms of standard medical principles. We provide +a procedure for manually designing patient queries to stress-test high-risk +limitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to +respond adequately to these queries, generating erroneous medical information, +unsafe recommendations, and content that may be considered offensive. + +
+
+ comment: 12 pages, 9 Tables, accepted to RANLP 2023 +
+
+
+
+
+ + ♻ ☆ OLISIA: a Cascade System for Spoken Dialogue State Tracking + + +
+ Though Dialogue State Tracking (DST) is a core component of spoken dialogue
+systems, recent work on this task mostly deals with chat corpora, disregarding
+the discrepancies between spoken and written language. In this paper, we
+propose OLISIA, a cascade system which integrates an Automatic Speech
+Recognition (ASR) model and a DST model. We introduce several adaptations in
+the ASR and DST modules to improve integration and robustness to spoken
+conversations. With these adaptations, our system ranked first in DSTC11 Track
+3, a benchmark to evaluate spoken DST. We conduct an in-depth analysis of the
+results and find that normalizing the ASR outputs and adapting the DST inputs
+through data augmentation, along with increasing the size of the pre-trained
+models, all play an important role in reducing the performance discrepancy
+between written and spoken conversations.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ♻ ☆ Theory of Mind Might Have Spontaneously Emerged in Large Language Models + + +
+ We explore the intriguing possibility that theory of mind (ToM), or the +uniquely human ability to impute unobservable mental states to others, might +have spontaneously emerged in large language models (LLMs). We designed 40 +false-belief tasks, considered a gold standard in testing ToM in humans, and +administered them to several LLMs. Each task included a false-belief scenario, +three closely matched true-belief controls, and the reversed versions of all +four. Smaller and older models solved no tasks; GPT-3-davinci-001 (from May +2020) and GPT-3-davinci-002 (from January 2022) solved 10%; and +GPT-3-davinci-003 (from November 2022) and ChatGPT-3.5-turbo (from March 2023) +solved 35% of the tasks, mirroring the performance of three-year-old children. +ChatGPT-4 (from June 2023) solved 90% of the tasks, matching the performance of +seven-year-old children. These findings suggest the intriguing possibility that +ToM, previously considered exclusive to humans, may have spontaneously emerged +as a byproduct of LLMs' improving language skills. + +
+
+ comment: TRY RUNNING ToM EXPERIMENTS ON YOUR OWN: The code and tasks used in + this study are available at Colab + (https://colab.research.google.com/drive/1ZRtmw87CdA4xp24DNS_Ik_uA2ypaRnoU). + Don't worry if you are not an expert coder, you should be able to run this + code with no-to-minimum Python skills. Or copy-paste the tasks to ChatGPT's + web interface +
+
+
+
+
+ + ♻ ☆ Cross-Lingual Constituency Parsing for Middle High German: A + Delexicalized Approach + + +
+ Constituency parsing plays a fundamental role in advancing natural language
+processing (NLP) tasks. However, training an automatic syntactic analysis
+system for ancient languages solely relying on annotated parse data is a
+formidable task due to the inherent challenges in building treebanks for such
+languages. It demands extensive linguistic expertise, leading to a scarcity of
+available resources. To overcome this hurdle, cross-lingual transfer
+techniques, which require minimal or even no annotated data for low-resource
+target languages, offer a promising solution. In this study, we focus on
+building a constituency parser for $\mathbf{M}$iddle $\mathbf{H}$igh
+$\mathbf{G}$erman ($\mathbf{MHG}$) under realistic conditions, where no
+annotated MHG treebank is available for training. In our approach, we leverage
+the linguistic continuity and structural similarity between MHG and
+$\mathbf{M}$odern $\mathbf{G}$erman ($\mathbf{MG}$), along with the abundance
+of MG treebank resources. Specifically, by employing the
+$\mathit{delexicalization}$ method, we train a constituency parser on MG parse
+datasets and perform cross-lingual transfer to MHG parsing. Our delexicalized
+constituency parser demonstrates remarkable performance on the MHG test set,
+achieving an F1-score of 67.3%. It outperforms the best zero-shot cross-lingual
+baseline by 28.6 percentage points. These encouraging results underscore the
+practicality and potential for automatic syntactic analysis in other ancient
+languages that face challenges similar to those of MHG.
+
<div class="w3-clear"></div>
+
+ comment: Accepted to ALP 2023 +
+
+
+
+
+ + ♻ ☆ A Trip Towards Fairness: Bias and De-Biasing in Large Language Models + + +
+ Cheap-to-Build Very Large-Language Models (CtB-LLMs) with affordable training
+are emerging as the next big revolution in natural language processing and
+understanding. These CtB-LLMs are democratizing access to trainable Very
+Large-Language Models (VLLMs) and, thus, may represent the building blocks of
+many NLP systems solving downstream tasks. Hence, any bias in CtB-LLMs, whether
+small or large, may cause widespread harm. In this paper, we perform a
+large-scale investigation of the bias of three families of CtB-LLMs, and we
+show that debiasing techniques are effective and usable. Indeed, according to
+current tests, the LLaMA and OPT families exhibit significant bias in gender,
+race, religion, and profession. In contrast to analyses of other LLMs, we
+discover that bias depends not on the number of parameters but on the
+perplexity. Finally, debiasing OPT using LoRA reduces bias by up to 4.12 points
+in the normalized stereotype score.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research
+since its inception. The technical evolution and emerging capabilities of the
+latest cohort of large language models (LLMs) have reinvigorated the subject
+beyond academia to the cultural zeitgeist. While recent NLP evaluation
+benchmark tasks test some aspects of human-imitative behaviour (e.g.,
+BIG-bench's 'human-like behavior' tasks), few, if any, examine creative
+problem solving abilities. Creative problem solving in humans is a well-studied
+topic in cognitive neuroscience with standardized tests that predominantly use
+the ability to associate (heterogeneous) connections among clue words as a
+metric for creativity. Exposure to misleading stimuli - distractors dubbed red
+herrings - impedes human performance in such tasks via the fixation effect and
+Einstellung paradigm. In cognitive neuroscience studies, such fixations are
+experimentally induced by pre-exposing participants to orthographically similar
+incorrect words to subsequent word-fragments or clues. The popular British quiz
+show Only Connect's Connecting Wall segment essentially mimics Mednick's Remote
+Associates Test (RAT) formulation with built-in, deliberate red herrings, which
+makes it an ideal proxy dataset to explore and study the fixation effect and
+Einstellung paradigm from cognitive neuroscience in LLMs. In this paper, we
+present the novel Only Connect Wall (OCW) dataset and report results from our
+evaluation of selected pre-trained language models and LLMs on creative problem
+solving tasks like grouping clue words by heterogeneous connections, and
+identifying correct open knowledge domain connections in respective groups. We
+synthetically generate two additional datasets, OCW-Randomized and OCW-WordNet,
+to further analyze our red-herrings hypothesis in language models. The code and
+link to the dataset are available at https://github.com/TaatiTeam/OCW.
+
<div class="w3-clear"></div>
+
+ comment: V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption + overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet + results in Section 4.3 (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and + Addressing Sociological Implications + + +
+ Gender bias in artificial intelligence (AI) and natural language processing
+has garnered significant attention due to its potential impact on societal
+perceptions and biases. This research paper aims to analyze gender bias in
+Large Language Models (LLMs) with a focus on multiple comparisons between GPT-2
+and GPT-3.5, two prominent language models, to better understand its
+implications. Through a comprehensive literature review, the study examines
+existing research on gender bias in AI language models and identifies gaps in
+the current knowledge. The methodology involves collecting and preprocessing
+data from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis
+techniques to evaluate gender bias in the generated text. The findings shed
+light on gendered word associations, language usage, and biased narratives
+present in the outputs of these Large Language Models. The discussion explores
+the ethical implications of gender bias and its potential consequences for
+social perceptions and marginalized communities. Additionally, the paper
+presents strategies for reducing gender bias in LLMs, including algorithmic
+approaches and data augmentation techniques. The research highlights the
+importance of interdisciplinary collaborations and the role of sociological
+studies in mitigating gender bias in AI models. By addressing these issues, we
+can pave the way for more inclusive and unbiased AI systems that have a
+positive impact on society.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ♻ ☆ NBIAS: A Natural Language Processing Framework for Bias Identification + in Text + + +
+ Bias in textual data can lead to skewed interpretations and outcomes when the
+data is used. These biases could perpetuate stereotypes, discrimination, or
+other forms of unfair treatment. An algorithm trained on biased data may end up
+making decisions that disproportionately impact a certain group of people.
+Therefore, it is crucial to detect and remove these biases to ensure the fair
+and ethical use of data. To this end, we develop a comprehensive and robust
+framework, NBIAS, that consists of four main layers: data, corpus construction,
+model development, and evaluation. The dataset is constructed by collecting
+diverse data from various domains, including social media, healthcare, and job
+hiring portals. On this corpus, we apply a transformer-based token
+classification model that identifies biased words and phrases through a unique
+named entity, BIAS. In the evaluation procedure, we incorporate a blend of
+quantitative and qualitative measures to gauge the effectiveness of our model.
+We achieve accuracy improvements ranging from 1% to 8% compared to baselines,
+and we are also able to develop a robust understanding of how the model
+functions. The proposed approach is applicable to a variety of biases and
+contributes to the fair and ethical use of textual data.
+
<div class="w3-clear"></div>
+
+ comment: Under review +
+
+
+
+
+
+ ♻ ☆ A Unified Front-End Framework for English Text-to-Speech Synthesis
+
+
+
<div class="w3-clear"></div>
+ The front-end is a critical component of English text-to-speech (TTS)
+systems, responsible for extracting linguistic features that are essential for
+a text-to-speech model to synthesize speech, such as prosody and phonemes. The
+English TTS front-end typically consists of a text normalization (TN) module,
+a prosody word prosody phrase (PWPP) module, and a grapheme-to-phoneme (G2P)
+module. However, current research on the English TTS front-end focuses solely
+on individual modules, neglecting the interdependence between them and
+resulting in sub-optimal performance for each module. Therefore, this paper
+proposes a unified front-end framework that captures the dependencies among the
+English TTS front-end modules. Extensive experiments have demonstrated that the
+proposed method achieves state-of-the-art (SOTA) performance in all modules.
+
<div class="w3-clear"></div>
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ On the Robustness of ChatGPT: An Adversarial and Out-of-distribution + Perspective ICLR 2023 + + +
+ ChatGPT is a recent chatbot service released by OpenAI and has been receiving
+increasing attention over the past few months. While evaluations of various
+aspects of ChatGPT have been done, its robustness, i.e., its performance on
+unexpected inputs, is still unclear to the public. Robustness is of particular
+concern in responsible AI, especially for safety-critical applications. In this
+paper, we conduct a thorough evaluation of the robustness of ChatGPT from the
+adversarial and out-of-distribution (OOD) perspective. To do so, we employ the
+AdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart
+review and DDXPlus medical diagnosis datasets for OOD evaluation. We select
+several popular foundation models as baselines. Results show that ChatGPT
+exhibits consistent advantages on most adversarial and OOD classification and
+translation tasks. However, the absolute performance is far from perfect, which
+suggests that adversarial and OOD robustness remains a significant threat to
+foundation models. Moreover, ChatGPT shows astounding performance in
+understanding dialogue-related texts, and we find that it tends to provide
+informal suggestions for medical tasks instead of definitive answers. Finally,
+we present in-depth discussions of possible research directions.
+
<div class="w3-clear"></div>
+
+ comment: Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable + Large-Scale Machine Learning Models; code is at: + https://github.com/microsoft/robustlearn; more works: + https://llm-eval.github.io/ +
+
+
+
+
+ + ♻ ☆ Block-State Transformer + + +
+ State space models (SSMs) have shown impressive results on tasks that require
+modeling long-range dependencies and efficiently scale to long sequences owing
+to their subquadratic runtime complexity. Originally designed for continuous
+signals, SSMs have shown superior performance on a plethora of tasks, in vision
+and audio; however, SSMs still lag behind Transformers in language modeling
+tasks. In this work, we propose a hybrid layer named Block-State Transformer
+(BST) that internally combines an SSM sublayer for long-range contextualization
+and a Block Transformer sublayer for short-term representation of sequences. We
+study three different, and completely parallelizable, variants that integrate
+SSMs and block-wise attention. We show that our model outperforms similar
+Transformer-based architectures on language modeling perplexity and generalizes
+to longer sequences. In addition, the Block-State Transformer demonstrates a
+more than tenfold increase in speed at the layer level compared to the
+Block-Recurrent Transformer when model parallelization is employed.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ♻ ☆ Asymmetric feature interaction for interpreting model predictions ACL 2023 + + +
+ In natural language processing (NLP), deep neural networks (DNNs) can model
+complex interactions between contextual features and have achieved impressive
+results on a range of NLP tasks. Prior works on feature interaction
+attribution mainly focus on studying symmetric interaction that only explains
+the additional influence of a set of words in combination, which fails to
+capture asymmetric influence that contributes to model prediction. In this
+work, we propose an asymmetric feature interaction attribution explanation
+model that aims to explore asymmetric higher-order feature interactions in the
+inference of deep neural NLP models. By representing our explanation with a
+directed interaction graph, we experimentally demonstrate the interpretability
+of the graph for discovering asymmetric feature interactions. Experimental
+results on two sentiment classification datasets show the superiority of our
+model against the state-of-the-art feature interaction attribution methods in
+identifying influential features for model predictions. Our code is available
+at https://github.com/StillLu/ASIV.
+
<div class="w3-clear"></div>
+
+ comment: Accepted by Findings of the Association for Computational + Linguistics: ACL 2023 (long paper) +
+
+
+
+
+ + ♻ ☆ Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language + Models + + +
+ Dense retrieval (DR) converts queries and documents into dense embeddings and
+measures the similarity between queries and documents in vector space. One of
+the challenges in DR is the lack of domain-specific training data. While DR
+models can learn from large-scale public datasets like MS MARCO through
+transfer learning, evidence shows that not all DR models and domains can
+benefit from transfer learning equally. Recently, some researchers have
+resorted to large language models (LLMs) to improve the zero-shot and few-shot
+DR models. However, the hard prompts or human-written prompts utilized in these
+works cannot guarantee the quality of the generated weak queries. To tackle
+this, we propose soft prompt tuning for augmenting DR (SPTAR): for each task,
+we leverage soft prompt tuning to optimize a task-specific soft prompt on
+limited ground truth data and then prompt the LLMs to tag unlabeled documents
+with weak queries, yielding enough weak document-query pairs to train
+task-specific dense retrievers. We design a filter to select high-quality
+example document-query pairs in the prompt to further improve the quality of
+the weakly tagged queries. To the best of our knowledge, there is no prior work
+utilizing soft prompt tuning to augment DR models. The experiments demonstrate
+that SPTAR outperforms the unsupervised baseline BM25 and a recently proposed
+LLM-based augmentation method for DR.
+
<div class="w3-clear"></div>
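+ To make the soft prompt idea concrete, the sketch below shows how a small set
+of learnable prompt embeddings could be prepended to the input embeddings of a
+frozen language model and trained alone. The module, its shapes, and the
+training outline are our own illustrative assumptions, not the released SPTAR
+code.
+
+# Minimal soft prompt tuning sketch (assumed setup, not the official SPTAR code).
+import torch
+import torch.nn as nn
+
+class SoftPrompt(nn.Module):
+    """Learnable prompt vectors prepended to the input embeddings of a frozen LM."""
+    def __init__(self, n_prompt_tokens: int, d_model: int):
+        super().__init__()
+        self.prompt = nn.Parameter(torch.randn(n_prompt_tokens, d_model) * 0.02)
+
+    def forward(self, input_embeds: torch.Tensor) -> torch.Tensor:
+        # input_embeds: (batch, seq_len, d_model)
+        batch = input_embeds.size(0)
+        prompt = self.prompt.unsqueeze(0).expand(batch, -1, -1)
+        return torch.cat([prompt, input_embeds], dim=1)
+
+# Training outline: freeze the LM, optimize only soft_prompt.parameters() on the
+# limited ground-truth (document, query) pairs, then prompt the LM to generate
+# weak queries for unlabeled documents.
+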
+
+ comment: fix typos +
+
+
+
+
+ + ♻ ☆ Evaluating the Robustness to Instructions of Large Language Models + + +
+ Recently, instruction fine-tuning has risen to prominence as a potential
+method for enhancing the zero-shot capabilities of Large Language Models (LLMs)
+on novel tasks. This technique has shown an exceptional ability to boost the
+performance of moderately sized LLMs, sometimes even reaching performance
+levels comparable to those of much larger model variants. The focus is on the
+robustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an
+exploration of six models including Alpaca, Vicuna, WizardLM, and traditional
+task-oriented models (Flan-T5-XL/XXL, T0++) using real-world relation
+extraction datasets as case studies. We carried out a comprehensive evaluation
+of these instruction-following LLMs, which have been tuned based on open-domain
+instructions and task-oriented instructions. The main discussion is their
+performance and robustness towards instructions. We have observed that in most
+cases, the model's performance in dealing with unfamiliar instructions tends to
+worsen significantly, and the robustness of the model for RE instructions
+deteriorates compared to QA. Further, we discovered that up until a certain
+parameter size threshold (3B), the performance of the FLAN-T5 model improves as
+the parameter count increases. The robustness of different scales of FLAN-T5
+models to RE instructions is worse than their robustness to QA instructions.
+
<div class="w3-clear"></div>
+
+ comment: In our study, erroneous data analysis inadvertently led to misleading + outcomes. Incorrect variables were included, distorting results. This + emphasizes the significance of robust data processing and analysis techniques + in research +
+
+
+
+
+ + ♻ ☆ (QA)$^2$: Question Answering with Questionable Assumptions ACL 2023 + + +
+ Naturally occurring information-seeking questions often contain questionable +assumptions -- assumptions that are false or unverifiable. Questions containing +questionable assumptions are challenging because they require a distinct answer +strategy that deviates from typical answers for information-seeking questions. +For instance, the question "When did Marie Curie discover Uranium?" cannot be +answered as a typical "when" question without addressing the false assumption +"Marie Curie discovered Uranium". In this work, we propose (QA)$^2$ (Question +Answering with Questionable Assumptions), an open-domain evaluation dataset +consisting of naturally occurring search engine queries that may or may not +contain questionable assumptions. To be successful on (QA)$^2$, systems must be +able to detect questionable assumptions and also be able to produce adequate +responses for both typical information-seeking questions and ones with +questionable assumptions. Through human rater acceptability on end-to-end QA +with (QA)$^2$, we find that current models do struggle with handling +questionable assumptions, leaving substantial headroom for progress. + +
+
+ comment: ACL 2023 camera-ready +
+
+
+
+
+ + ♻ ☆ EntropyRank: Unsupervised Keyphrase Extraction via Side-Information + Optimization for Language Model-based Text Compression + + +
+ We propose an unsupervised method to extract keywords and keyphrases from
+texts based on a pre-trained language model (LM) and Shannon's information
+maximization. Specifically, our method extracts phrases having the highest
+conditional entropy under the LM. The resulting set of keyphrases turns out to
+solve a relevant information-theoretic problem: if provided as side
+information, it leads to the expected minimal binary code length in compressing
+the text using the LM and an entropy encoder. Alternatively, the resulting set
+is an approximation via a causal LM to the set of phrases that minimize the
+entropy of the text when conditioned upon it. Empirically, the method provides
+results comparable to the most commonly used methods in various keyphrase
+extraction benchmark challenges.
+
<div class="w3-clear"></div>
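+ For intuition, the per-position conditional entropy that drives the extraction
+can be computed directly from a causal LM's next-token distribution. The snippet
+below (using GPT-2 as a stand-in model) is a hedged sketch of that scoring step
+only, not the authors' full phrase-extraction pipeline.
+
+# Sketch: next-token entropy under a causal LM (scoring step only; model choice assumed).
+import torch
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+tok = GPT2TokenizerFast.from_pretrained("gpt2")
+lm = GPT2LMHeadModel.from_pretrained("gpt2").eval()
+
+def token_entropies(text: str):
+    ids = tok(text, return_tensors="pt").input_ids
+    with torch.no_grad():
+        logits = lm(ids).logits                     # (1, seq_len, vocab)
+    probs = torch.softmax(logits, dim=-1)
+    ent = -(probs * probs.clamp_min(1e-12).log()).sum(-1).squeeze(0)
+    return list(zip(tok.convert_ids_to_tokens(ids.squeeze(0)), ent.tolist()))
+
+# Phrases beginning at high-entropy positions are candidate keyphrases.
+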
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 130 + +
+
+
+ + ☆ 3D Adversarial Augmentations for Robust Out-of-Domain Predictions + + +
+ Since real-world training datasets cannot properly sample the long tail of
+the underlying data distribution, corner cases and rare out-of-domain samples
+can severely hinder the performance of state-of-the-art models. This problem
+becomes even more severe for dense tasks, such as 3D semantic segmentation,
+where points of non-standard objects can be confidently associated with the
+wrong class. In this work, we focus on improving the generalization to
+out-of-domain data. We achieve this by augmenting the training set with
+adversarial examples. First, we learn a set of vectors that deform the objects
+in an adversarial fashion. To prevent the adversarial examples from being too
+far from the existing data distribution, we preserve their plausibility through
+a series of constraints, ensuring sensor-awareness and shape smoothness. Then,
+we perform adversarial augmentation by applying the learned sample-independent
+vectors to the available objects when training a model. We conduct extensive
+experiments across a variety of scenarios on data from KITTI, Waymo, and CrashD
+for 3D object detection, and on data from SemanticKITTI, Waymo, and nuScenes
+for 3D semantic segmentation. Despite training on a standard single dataset,
+our approach substantially improves the robustness and generalization of both
+3D object detection and 3D semantic segmentation methods to out-of-domain data.
+
<div class="w3-clear"></div>
+
+ comment: 37 pages, 12 figures +
+
+
+
+
+ + ☆ An Adaptive Tangent Feature Perspective of Neural Networks + + +
+ In order to better understand feature learning in neural networks, we propose +a framework for understanding linear models in tangent feature space where the +features are allowed to be transformed during training. We consider linear +transformations of features, resulting in a joint optimization over parameters +and transformations with a bilinear interpolation constraint. We show that this +optimization problem has an equivalent linearly constrained optimization with +structured regularization that encourages approximately low rank solutions. +Specializing to neural network structure, we gain insights into how the +features and thus the kernel function change, providing additional nuance to +the phenomenon of kernel alignment when the target function is poorly +represented using tangent features. In addition to verifying our theoretical +observations in real neural networks on a simple regression problem, we +empirically show that an adaptive feature implementation of tangent feature +classification has an order of magnitude lower sample complexity than the fixed +tangent feature model on MNIST and CIFAR-10. + +
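+ As a concrete reference point, the fixed tangent features of a network are the
+per-example gradients of its output with respect to the parameters; a linear
+model on these features is the usual linearized view that the paper generalizes
+by letting the features transform during training. The toy computation below is
+our own illustration, not the paper's code.
+
+# Sketch: fixed tangent features phi(x) = grad_theta f(x; theta) for a small MLP.
+import torch
+import torch.nn as nn
+
+net = nn.Sequential(nn.Linear(10, 32), nn.Tanh(), nn.Linear(32, 1))
+params = list(net.parameters())
+
+def tangent_features(x: torch.Tensor) -> torch.Tensor:
+    out = net(x.unsqueeze(0)).squeeze()
+    grads = torch.autograd.grad(out, params)
+    return torch.cat([g.reshape(-1) for g in grads])
+
+phi = tangent_features(torch.randn(10))
+# A linear predictor w @ phi(x) on these features is the fixed tangent feature
+# model; the paper additionally optimizes a linear transformation of the
+# features themselves during training.
+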
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ A General-Purpose Self-Supervised Model for Computational Pathology + + +
+ Tissue phenotyping is a fundamental computational pathology (CPath) task in
+learning objective characterizations of histopathologic biomarkers in anatomic
+pathology. However, whole-slide imaging (WSI) poses a complex computer vision
+problem in which the large-scale image resolutions of WSIs and the enormous
+diversity of morphological phenotypes preclude large-scale data annotation.
+Current efforts have proposed using pretrained image encoders with either
+transfer learning from natural image datasets or self-supervised pretraining on
+publicly-available histopathology datasets, but have not been extensively
+developed and evaluated across diverse tissue types at scale. We introduce UNI,
+a general-purpose self-supervised model for pathology, pretrained using over
+100 million tissue patches from over 100,000 diagnostic haematoxylin and
+eosin-stained WSIs across 20 major tissue types, and evaluated on 33
+representative CPath clinical tasks of varying diagnostic difficulty. In
+addition to outperforming previous state-of-the-art models, we demonstrate new
+modeling capabilities in CPath such as resolution-agnostic tissue
+classification, slide classification using few-shot class prototypes, and
+disease subtyping generalization in classifying up to 108 cancer types in the
+OncoTree code classification system. UNI advances unsupervised representation
+learning at scale in CPath in terms of both pretraining data and downstream
+evaluation, enabling data-efficient AI models that can generalize and transfer
+to a gamut of diagnostically-challenging tasks and clinical workflows in
+anatomic pathology.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ☆ Learning Modulated Transformation in GANs + + +
+ The success of style-based generators largely benefits from style modulation,
+which helps take care of the cross-instance variation within data. However, the
+instance-wise stochasticity is typically introduced via regular convolution,
+where kernels interact with features at some fixed locations, limiting its
+capacity for modeling geometric variation. To alleviate this problem, we equip
+the generator in generative adversarial networks (GANs) with a plug-and-play
+module, termed as modulated transformation module (MTM). This module predicts
+spatial offsets under the control of latent codes, based on which the
+convolution operation can be applied at variable locations for different
+instances, and hence offers the model an additional degree of freedom to handle
+geometry deformation. Extensive experiments suggest that our approach can be
+faithfully generalized to various generative tasks, including image generation,
+3D-aware image synthesis, and video generation, and is compatible with
+state-of-the-art frameworks without any hyper-parameter tuning. It is
+noteworthy that, towards human generation on the challenging TaiChi dataset, we
+improve the FID of StyleGAN3 from 21.36 to 13.60, demonstrating the efficacy of
+learning modulated geometry transformation.
+
<div class="w3-clear"></div>
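+ One plausible minimal form of such a module, read directly from the description
+above, is a latent-conditioned offset predictor feeding a deformable convolution.
+The layer sizes and the use of torchvision's deform_conv2d below are illustrative
+assumptions, not the authors' implementation.
+
+# Sketch: latent-code-conditioned offsets + deformable convolution (assumed form).
+import torch
+import torch.nn as nn
+from torchvision.ops import deform_conv2d
+
+class ModulatedTransform(nn.Module):
+    def __init__(self, latent_dim: int, channels: int, k: int = 3):
+        super().__init__()
+        self.k = k
+        self.weight = nn.Parameter(torch.randn(channels, channels, k, k) * 0.02)
+        self.to_offset = nn.Linear(latent_dim, 2 * k * k)  # 2 coords per kernel tap
+
+    def forward(self, feat: torch.Tensor, z: torch.Tensor) -> torch.Tensor:
+        b, _, h, w = feat.shape
+        # One offset field per instance, broadcast over spatial locations.
+        offset = self.to_offset(z).view(b, -1, 1, 1).expand(-1, -1, h, w).contiguous()
+        return deform_conv2d(feat, offset, self.weight, padding=self.k // 2)
+
+y = ModulatedTransform(64, 8)(torch.randn(2, 8, 16, 16), torch.randn(2, 64))
+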
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Multimodal Contrastive Learning and Tabular Attention for Automated + Alzheimer's Disease Prediction + + +
+ Alongside neuroimaging such as MRI scans and PET, Alzheimer's disease (AD)
+datasets contain valuable tabular data including AD biomarkers and clinical
+assessments. Existing computer vision approaches struggle to utilize this
+additional information. To address these needs, we propose a generalizable
+framework for multimodal contrastive learning of image data and tabular data, a
+novel tabular attention module for amplifying and ranking salient features in
+tables, and the application of these techniques to Alzheimer's disease
+prediction. Experimental evaluations demonstrate the strength of our framework
+by detecting Alzheimer's disease (AD) from over 882 MR image slices from the
+ADNI database. Taking advantage of the high interpretability of tabular data
+and our novel tabular attention approach, we identify and rank the most
+predominant features by attributing the attention scores for each row of the
+table. Results show that the model achieves an accuracy of over 83.8%, almost a
+10% increase over the previous state of the art.
+
<div class="w3-clear"></div>
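+ Reading the description above, the tabular attention module can be pictured as
+a learned per-feature score that both reweights the features and yields a
+ranking. The module below is a hedged sketch under that reading; names and sizes
+are invented.
+
+# Sketch: attention over tabular features for amplification and ranking (assumed design).
+import torch
+import torch.nn as nn
+
+class TabularAttention(nn.Module):
+    def __init__(self, n_features: int, hidden: int = 64):
+        super().__init__()
+        self.score = nn.Sequential(
+            nn.Linear(n_features, hidden), nn.ReLU(), nn.Linear(hidden, n_features))
+
+    def forward(self, x: torch.Tensor):
+        attn = torch.softmax(self.score(x), dim=-1)  # (batch, n_features), sums to 1
+        return x * attn, attn
+
+weighted, attn = TabularAttention(12)(torch.randn(4, 12))
+ranking = attn.mean(0).argsort(descending=True)  # most predominant features first
+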
+
+
+
+
+ + ☆ Input margins can predict generalization too + + +
+ Understanding generalization in deep neural networks is an active area of
+research. A promising avenue of exploration has been that of margin
+measurements: the shortest distance to the decision boundary for a given sample
+or its representation internal to the network. While margins have been shown to
+be correlated with the generalization ability of a model when measured at its
+hidden representations (hidden margins), no such link between large margins and
+generalization has been established for input margins. We show that while input
+margins are not generally predictive of generalization, they can be if the
+search space is appropriately constrained. We develop such a measure based on
+input margins, which we refer to as 'constrained margins'. The predictive power
+of this new measure is demonstrated on the 'Predicting Generalization in Deep
+Learning' (PGDL) dataset and contrasted with hidden representation margins. We
+find that constrained margins achieve highly competitive scores and outperform
+other margin measurements in general. This provides a novel insight into the
+relationship between generalization and classification margins, and highlights
+the importance of considering the data manifold for investigations of
+generalization in DNNs.
+
<div class="w3-clear"></div>
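+ The abstract does not spell out the constrained search space, but the
+underlying quantity, an input margin, can be approximated by moving an input
+along a fixed direction until the predicted class flips. The bisection sketch
+below illustrates only that generic idea, not the paper's constrained measure.
+
+# Sketch: input margin along a fixed unit direction via bisection (generic idea only).
+import torch
+
+def input_margin(model, x, direction, max_dist=10.0, iters=30):
+    y0 = model(x.unsqueeze(0)).argmax(1)
+    if model((x + max_dist * direction).unsqueeze(0)).argmax(1) == y0:
+        return float("inf")                      # no label flip within max_dist
+    lo, hi = 0.0, max_dist
+    for _ in range(iters):
+        mid = (lo + hi) / 2
+        if model((x + mid * direction).unsqueeze(0)).argmax(1) == y0:
+            lo = mid                              # still the original class
+        else:
+            hi = mid                              # past the decision boundary
+    return hi
+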
+
+
+
+
+ + ☆ Online Overexposed Pixels Hallucination in Videos with Adaptive + Reference Frame Selection + + +
+ Low dynamic range (LDR) cameras cannot deal with wide dynamic range inputs,
+frequently leading to local overexposure issues. We present a learning-based
+system to reduce these artifacts without resorting to complex acquisition
+mechanisms like alternating exposures or costly processing that are typical of
+high dynamic range (HDR) imaging. We propose a transformer-based deep neural
+network (DNN) to infer the missing HDR details. In an ablation study, we show
+the importance of using a multiscale DNN and train it with the proper cost
+function to achieve state-of-the-art quality. To aid the reconstruction of the
+overexposed areas, our DNN takes a reference frame from the past as an
+additional input. This leverages the commonly occurring temporal instabilities
+of autoexposure to our advantage: since well-exposed details in the current
+frame may be overexposed in the future, we use reinforcement learning to train
+a reference frame selection DNN that decides whether to adopt the current frame
+as a future reference. Without resorting to alternating exposures, we therefore
+obtain a causal HDR hallucination algorithm with potential application in
+common video acquisition settings. Our demo video can be found at
+https://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view
+
<div class="w3-clear"></div>
+
+ comment: The demo video can be found at + https://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view +
+
+
+
+
+ + ☆ Canonical Factors for Hybrid Neural Fields ICCV 2023 + + +
+ Factored feature volumes offer a simple way to build more compact, efficient,
+and interpretable neural fields, but also introduce biases that are not
+necessarily beneficial for real-world data. In this work, we (1) characterize
+the undesirable biases that these architectures have for axis-aligned signals
+-- they can lead to radiance field reconstruction differences of as high as 2
+PSNR -- and (2) explore how learning a set of canonicalizing transformations
+can improve representations by removing these biases. We prove in a
+two-dimensional model problem that simultaneously learning these
+transformations together with scene appearance succeeds with drastically
+improved efficiency. We validate the resulting architectures, which we call
+TILTED, using image, signed distance, and radiance field reconstruction tasks,
+where we observe improvements across quality, robustness, compactness, and
+runtime. Results demonstrate that TILTED can enable capabilities comparable to
+baselines that are 2x larger, while highlighting weaknesses of neural field
+evaluation procedures.
+
<div class="w3-clear"></div>
+
+ comment: ICCV 2023. Project webpage: https://brentyi.github.io/tilted/ +
+
+
+
+
+ + ☆ Pseudo-Boolean Polynomials Approach To Edge Detection And Image + Segmentation + + +
+ We introduce a deterministic approach to edge detection and image +segmentation by formulating pseudo-Boolean polynomials on image patches. The +approach works by applying a binary classification of blob and edge regions in +an image based on the degrees of pseudo-Boolean polynomials calculated on +patches extracted from the provided image. We test our method on simple images +containing primitive shapes of constant and contrasting colour and establish +the feasibility before applying it to complex instances like aerial landscape +images. The proposed method is based on the exploitation of the reduction, +polynomial degree, and equivalence properties of penalty-based pseudo-Boolean +polynomials. + +
+
+ comment: 14 pages, 8 figures, submitted to the International Conference Data + Analysis, Optimization and Their Applications on the Occasion of Boris + Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region, + Moscow Institute of Physics and Technology + https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php +
+
+
+
+
+ + ☆ Complementing Onboard Sensors with Satellite Map: A New Perspective for + HD Map Construction + + +
+ High-Definition (HD) maps play a crucial role in autonomous driving systems.
+Recent methods have attempted to construct HD maps in real-time based on
+information obtained from vehicle onboard sensors. However, the performance of
+these methods is highly susceptible to the environment surrounding the vehicle
+due to the inherent limitations of onboard sensors, such as their weak capacity
+for long-range detection. In this study, we demonstrate that supplementing
+onboard sensors with satellite maps can enhance the performance of HD map
+construction methods, leveraging the broad coverage capability of satellite
+maps. For the purpose of further research, we release the satellite map tiles
+as a complementary dataset of the nuScenes dataset. Meanwhile, we propose a
+hierarchical fusion module that enables better fusion of satellite map
+information with existing methods. Specifically, we design an attention mask
+based on segmentation and distance, applying the cross-attention mechanism to
+fuse onboard Bird's Eye View (BEV) features and satellite features in
+feature-level fusion. An alignment module is introduced before concatenation in
+BEV-level fusion to mitigate the impact of misalignment between the two
+features. The experimental results on the augmented nuScenes dataset showcase
+the seamless integration of our module into three existing HD map construction
+methods. It notably enhances their performance in both HD map semantic
+segmentation and instance detection tasks.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ☆ WrappingNet: Mesh Autoencoder via Deep Sphere Deformation + + +
+ There have been recent efforts to learn more meaningful representations via
+fixed length codewords from mesh data, since a mesh serves as a more complete
+model of the underlying 3D shape than a point cloud. However, the mesh
+connectivity presents new difficulties when constructing a deep learning
+pipeline for meshes. Previous mesh unsupervised learning approaches typically
+assume category-specific templates, e.g., human face/body templates. This
+restricts the learned latent codes to be meaningful only for objects in a
+specific category, so the learned latent spaces cannot be used across different
+types of objects. In this work, we present WrappingNet, the first mesh
+autoencoder enabling general mesh unsupervised learning over heterogeneous
+objects. It introduces a novel base graph in the bottleneck dedicated to
+representing mesh connectivity, which is shown to facilitate learning a shared
+latent space representing object shape. The superiority of WrappingNet mesh
+learning is further demonstrated via improved reconstruction quality and
+competitive classification compared to point cloud learning, as well as latent
+interpolation between meshes of different categories.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ☆ Robust Long-Tailed Learning via Label-Aware Bounded CVaR + + +
+ Data in real-world classification problems are often imbalanced or
+long-tailed, wherein the majority classes contain most of the samples and
+dominate the model training. In such a setting, the naive model tends to have
+poor performance on the minority classes. Previously, a variety of loss
+modifications have been proposed to address the long-tailed learning problem,
+but these methods either treat the samples in the same class indiscriminately
+or lack a theoretical guarantee. In this paper, we propose two novel approaches
+based on CVaR (Conditional Value at Risk) to improve the performance of
+long-tailed learning with a solid theoretical grounding. Specifically, we first
+introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss to overcome the
+pessimistic result of the original CVaR, and further design the optimal weight
+bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we additionally propose a
+LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to stabilize the
+optimization process, for which we also offer theoretical support. Extensive
+experiments on real-world datasets with long-tailed label distributions verify
+the superiority of our proposed methods.
+
<div class="w3-clear"></div>
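+ For readers new to CVaR losses: the CVaR at level alpha of a batch of
+per-sample losses is the mean of the worst alpha-fraction. The snippet below
+shows a plain class-conditional CVaR loss as a baseline for intuition; the
+label-aware weight bounds and logit adjustment that define LAB-CVaR are not
+reproduced here.
+
+# Sketch: class-conditional CVaR loss (generic CVaR, not the paper's LAB-CVaR bounds).
+import torch
+import torch.nn.functional as F
+
+def cvar_loss(logits, labels, alpha: float = 0.3):
+    losses = F.cross_entropy(logits, labels, reduction="none")
+    per_class = []
+    for c in labels.unique():
+        lc = losses[labels == c]
+        k = max(1, int(alpha * lc.numel()))
+        per_class.append(lc.topk(k).values.mean())  # mean of the worst alpha-fraction
+    return torch.stack(per_class).mean()
+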
+
+
+
+
+ + ☆ Color Aesthetics: Fuzzy based User-driven Method for Harmony and + Preference Prediction SC + + +
+ Color is the most important intrinsic sensory feature that has a powerful
+impact on product sales. Color is even responsible for raising the aesthetic
+senses in our brains. Accounting for individual differences is crucial in color
+aesthetics, and requires user-driven mechanisms for various e-commerce
+applications. We propose a method for the quantitative evaluation of all types
+of perceptual responses to color(s): distinct color preference, color harmony,
+and color combination preference. Preference for color schemes can be predicted
+by combining preferences for the basic colors with ratings of color harmony.
+Harmonious palettes are extracted from a big dataset using comparison
+algorithms based on fuzzy similarity and grouping. The proposed model results
+in useful predictions of harmony and preference for multicolored images. For
+example, in the context of apparel coordination, it allows predicting a
+preference for a look based on clothing colors. Our approach differs from
+standard aesthetic models, since it accounts for personal variation. In
+addition, it can process not only lower-order color pairs, but also groups of
+several colors.
+
<div class="w3-clear"></div>
+
+ comment: It was accepted as a short paper. IFSA-SCIS 2017 Conference held in + Otsu, Japan +
+
+
+
+
+ + ☆ Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation + and Diagnosis + + +
+ Thyroid nodule segmentation is a crucial step in the diagnostic procedure of
+physicians and computer-aided diagnosis systems. Mostly, current studies treat
+segmentation and diagnosis as independent tasks without considering the
+correlation between these tasks. The sequential steps of these independent
+tasks in computer-aided diagnosis systems may lead to the accumulation of
+errors. Therefore, it is worth combining them as a whole by exploring the
+relationship between thyroid nodule segmentation and diagnosis. According to
+the thyroid imaging reporting and data system (TI-RADS), the assessment of
+shape and margin characteristics is the prerequisite for the discrimination of
+benign and malignant thyroid nodules. These characteristics can be observed in
+the thyroid nodule segmentation masks. Inspired by the diagnostic procedure of
+TI-RADS, this paper proposes a shape-margin knowledge augmented network
+(SkaNet) for simultaneous thyroid nodule segmentation and diagnosis. Due to
+the similarity in visual features between segmentation and diagnosis, SkaNet
+shares visual features in the feature extraction stage and then utilizes a
+dual-branch architecture to perform thyroid nodule segmentation and diagnosis
+tasks simultaneously. To enhance effective discriminative features, an
+exponential mixture module is devised, which incorporates convolutional feature
+maps and self-attention maps by exponential weighting. Then, SkaNet is jointly
+optimized by a knowledge augmented multi-task loss function with a constraint
+penalty term. It embeds shape and margin characteristics through numerical
+computation and models the relationship between the thyroid nodule diagnosis
+results and segmentation masks.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ☆ On the Robustness of Object Detection Models in Aerial Images + + +
+ The robustness of object detection models is a major concern when applied to
+real-world scenarios. However, the performance of most object detection models
+degrades when applied to images subjected to corruptions, since they are
+usually trained and evaluated on clean datasets. Enhancing the robustness of
+object detection models is of utmost importance, especially for those designed
+for aerial images, which feature complex backgrounds and substantial variations
+in the scales and orientations of objects. This paper addresses the challenge
+of assessing the robustness of object detection models in aerial images, with a
+specific emphasis on scenarios where images are affected by clouds. In this
+study, we introduce two novel benchmarks based on DOTA-v1.0. The first
+benchmark encompasses 19 prevalent corruptions, while the second focuses on
+cloud-corrupted images, a phenomenon uncommon in natural images yet frequent in
+aerial photography. We systematically evaluate the robustness of mainstream
+object detection models and perform numerous ablation experiments. Through our
+investigations, we find that enhanced model architectures, larger networks,
+well-crafted modules, and judicious data augmentation strategies collectively
+enhance the robustness of aerial object detection models. The benchmarks we
+propose and our comprehensive experimental analyses can facilitate research on
+robust object detection in aerial images. Codes and datasets are available at:
+(https://github.com/hehaodong530/DOTA-C)
+
<div class="w3-clear"></div>
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Efficient Model Personalization in Federated Learning via + Client-Specific Prompt Generation ICCV 2023 + + +
+ Federated learning (FL) emerges as a decentralized learning framework which
+trains models from multiple distributed clients without sharing their data to
+preserve privacy. Recently, large-scale pre-trained models (e.g., Vision
+Transformer) have shown a strong capability of deriving robust representations.
+However, the data heterogeneity among clients, the limited computation
+resources, and the communication bandwidth restrict the deployment of
+large-scale models in FL frameworks. To leverage robust representations from
+large-scale models while enabling efficient model personalization for
+heterogeneous clients, we propose a novel personalized FL framework of
+client-specific Prompt Generation (pFedPG), which learns to deploy a
+personalized prompt generator at the server for producing client-specific
+visual prompts that efficiently adapt frozen backbones to local data
+distributions. Our proposed framework jointly optimizes the stages of
+personalized prompt adaptation locally and personalized prompt generation
+globally. The former aims to train visual prompts that adapt foundation models
+to each client, while the latter observes local optimization directions to
+generate personalized prompts for all clients. Through extensive experiments on
+benchmark datasets, we show that our pFedPG is favorable against
+state-of-the-art personalized FL methods under various types of data
+heterogeneity, allowing computation and communication efficient model
+personalization.
+
<div class="w3-clear"></div>
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ AnomalyGPT: Detecting Industrial Anomalies using Large Vision-Language + Models + + +
+ Large Vision-Language Models (LVLMs) such as MiniGPT-4 and LLaVA have
+demonstrated the capability of understanding images and achieved remarkable
+performance in various visual tasks. Despite their strong abilities in
+recognizing common objects due to extensive training datasets, they lack
+specific domain knowledge and have a weaker understanding of localized details
+within objects, which hinders their effectiveness in the Industrial Anomaly
+Detection (IAD) task. On the other hand, most existing IAD methods only provide
+anomaly scores and necessitate the manual setting of thresholds to distinguish
+between normal and abnormal samples, which restricts their practical
+implementation. In this paper, we explore the utilization of LVLMs to address
+the IAD problem and propose AnomalyGPT, a novel IAD approach based on LVLMs. We
+generate training data by simulating anomalous images and producing
+corresponding textual descriptions for each image. We also employ an image
+decoder to provide fine-grained semantics and design a prompt learner to
+fine-tune the LVLM using prompt embeddings. Our AnomalyGPT eliminates the need
+for manual threshold adjustments and directly assesses the presence and
+locations of anomalies. Additionally, AnomalyGPT supports multi-turn dialogues
+and exhibits impressive few-shot in-context learning capabilities. With only
+one normal shot, AnomalyGPT achieves the state-of-the-art performance with an
+accuracy of 86.1%, an image-level AUC of 94.1%, and a pixel-level AUC of 95.3%
+on the MVTec-AD dataset. Code is available at
+https://github.com/CASIA-IVA-Lab/AnomalyGPT.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ☆ Ego-Motion Estimation and Dynamic Motion Separation from 3D Point Clouds + for Accumulating Data and Improving 3D Object Detection + + +
+ New 3+1D high-resolution radar sensors are gaining importance for 3D object
+detection in the automotive domain due to their relative affordability and
+improved detection compared to classic low-resolution radar sensors. One
+limitation of high-resolution radar sensors, compared to lidar sensors, is the
+sparsity of the generated point cloud. This sparsity could be partially
+overcome by accumulating radar point clouds of subsequent time steps. This
+contribution analyzes limitations of accumulating radar point clouds on the
+View-of-Delft dataset. By employing different ego-motion estimation approaches,
+we analyze the dataset's inherent constraints and possible solutions.
+Additionally, a learning-based instance motion estimation approach is deployed
+to investigate the influence of dynamic motion on the accumulated point cloud
+for object detection. Experiments document an improved object detection
+performance by applying an ego-motion estimation and dynamic motion correction
+approach.
+
<div class="w3-clear"></div>
+
+ comment: Published at: AmE 2023 - Automotive meets Electronics; 14. GMM + Symposium (https://ieeexplore.ieee.org/document/10227711) +
+
+
+
+
+ + ☆ Detect, Augment, Compose, and Adapt: Four Steps for Unsupervised Domain + Adaptation in Object Detection + + +
+ Unsupervised domain adaptation (UDA) plays a crucial role in object detection
+when adapting a source-trained detector to a target domain without annotated
+data. In this paper, we propose a novel and effective four-step UDA approach
+that leverages self-supervision and trains source and target data concurrently.
+We harness self-supervised learning to mitigate the lack of ground truth in the
+target domain. Our method consists of the following steps: (1) identify the
+region with the highest-confidence set of detections in each target image,
+which serve as our pseudo-labels; (2) crop the identified region and generate
+a collection of its augmented versions; (3) combine the augmented versions into
+a composite image; (4) adapt the network to the target domain using the
+composed image. Through extensive experiments under cross-camera,
+cross-weather, and synthetic-to-real scenarios, our approach achieves
+state-of-the-art performance, improving upon the nearest competitor by more
+than 2% in terms of mean Average Precision (mAP). The code is available at
+https://github.com/MohamedTEV/DACA.
+
<div class="w3-clear"></div>
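+ Steps (2) and (3) amount to tiling augmented copies of the confident crop into
+one training image. The sketch below is our own reading with arbitrary
+augmentations; the paper's exact augmentation set and composition layout may
+differ.
+
+# Sketch of the crop-augment-compose steps (augmentations and layout are assumptions).
+import torch
+import torchvision.transforms as T
+
+augment = T.Compose([T.RandomHorizontalFlip(), T.ColorJitter(0.4, 0.4, 0.4)])
+
+def compose_image(image: torch.Tensor, box) -> torch.Tensor:
+    """box = (x1, y1, x2, y2): the highest-confidence detection region (CHW image)."""
+    x1, y1, x2, y2 = box
+    crop = image[:, y1:y2, x1:x2]
+    crops = [augment(crop) for _ in range(4)]
+    top = torch.cat(crops[:2], dim=2)       # tile into a 2x2 composite
+    bottom = torch.cat(crops[2:], dim=2)
+    return torch.cat([top, bottom], dim=1)
+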
+
+
+
+
+ + ☆ Enhancing Mobile Face Anti-Spoofing: A Robust Framework for Diverse + Attack Types under Screen Flash + + +
+ Face anti-spoofing (FAS) is crucial for securing face recognition systems.
+However, existing FAS methods with handcrafted binary or pixel-wise labels have
+limitations due to diverse presentation attacks (PAs). In this paper, we
+propose an attack-type-robust face anti-spoofing framework under light flash,
+called ATR-FAS. Due to imaging differences caused by various attack types,
+traditional FAS methods based on a single binary classification network may
+result in excessive intra-class distance of spoof faces, leading to a challenge
+of decision boundary learning. Therefore, we employ multiple networks to
+reconstruct multi-frame depth maps as auxiliary supervision, where each network
+specializes in one type of attack. A dual gate module (DGM) consisting of a
+type gate and a frame-attention gate is introduced, which perform attack type
+recognition and multi-frame attention generation, respectively. The outputs of
+the DGM are utilized as weights to mix the results of the multiple expert
+networks. This mixture of experts enables ATR-FAS to generate
+spoof-differentiated depth maps and stably detect spoof faces without being
+affected by different types of PAs. Moreover, we design a differential
+normalization procedure to convert original flash frames into differential
+frames. This simple but effective processing enhances the details in flash
+frames, aiding in the generation of depth maps. To verify the effectiveness of
+our framework, we collected a large-scale dataset containing 12,660 live and
+spoof videos with diverse PAs under dynamic flash from the smartphone screen.
+Extensive experiments illustrate that the proposed ATR-FAS significantly
+outperforms existing state-of-the-art methods. The code and dataset will be
+available at https://github.com/Chaochao-Lin/ATR-FAS.
+
<div class="w3-clear"></div>
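+ The expert mixing described here can be read as a gated mixture of experts: a
+type gate scores the attack type, and its softmax weights blend the per-expert
+depth maps. The schematic below follows that reading with invented shapes, and
+omits the frame-attention gate for brevity.
+
+# Schematic type-gated mixture of depth-map experts (shapes invented, for intuition).
+import torch
+import torch.nn as nn
+
+class TypeGatedMixture(nn.Module):
+    def __init__(self, experts: nn.ModuleList, feat_dim: int):
+        super().__init__()
+        self.experts = experts                          # one expert per attack type
+        self.type_gate = nn.Linear(feat_dim, len(experts))
+
+    def forward(self, feats: torch.Tensor, frames: torch.Tensor) -> torch.Tensor:
+        w = torch.softmax(self.type_gate(feats), dim=-1)               # (b, n_experts)
+        depth = torch.stack([e(frames) for e in self.experts], dim=1)  # (b, n, H, W)
+        return (w[:, :, None, None] * depth).sum(dim=1)                # mixed depth map
+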
+
+
+
+
+ + ☆ IndGIC: Supervised Action Recognition under Low Illumination + + +
+ Technologies for human action recognition in the dark are gaining more and
+more attention, driven by huge demand in surveillance, motion control, and
+human-computer interaction. However, because of limitations in image
+enhancement methods and the cost of labeling low-light video datasets, existing
+methods face several problems. Some video-based approaches are effective and
+efficient on specific datasets but cannot generalize to most cases, while other
+methods using multiple sensors rely heavily on prior knowledge to deal with the
+noisy nature of the video stream. In this paper, we propose an action
+recognition method using a deep multi-input network. Furthermore, we propose an
+Independent Gamma Intensity Correction (Ind-GIC) to enhance poorly illuminated
+video, generating one gamma per frame to increase enhancement performance. To
+show that our method is effective, we evaluate and compare it against existing
+methods. Experimental results show that our model achieves high accuracy on the
+ARID dataset.
+
<div class="w3-clear"></div>
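+ The one-gamma-per-frame idea admits a very small reference form: estimate a
+gamma from each frame's statistics (here, its mean brightness, a common
+heuristic and our own assumption) and apply a power-law correction frame by
+frame.
+
+# Sketch: per-frame gamma correction (mean-brightness heuristic is our assumption).
+import numpy as np
+
+def gamma_correct_frame(frame: np.ndarray) -> np.ndarray:
+    """frame: float array in [0, 1]; choose gamma so the mean maps toward 0.5."""
+    mean = float(np.clip(frame.mean(), 1e-3, 1 - 1e-3))
+    gamma = np.log(0.5) / np.log(mean)        # solves mean ** gamma == 0.5
+    return frame ** gamma
+
+# For a video of shape (T, H, W, C) in [0, 1], one gamma per frame:
+# enhanced = np.stack([gamma_correct_frame(f) for f in video])
+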
+
+
+
+
+ + ☆ Imperceptible Adversarial Attack on Deep Neural Networks from Image + Boundary + + +
+ Although Deep Neural Networks (DNNs), such as convolutional neural networks
+(CNNs) and Vision Transformers (ViTs), have been successfully applied in the
+field of computer vision, they are demonstrated to be vulnerable to
+well-crafted Adversarial Examples (AEs) that can easily fool them. Research on
+AEs has been active, and many adversarial attacks and explanations have been
+proposed since they were discovered in 2014. The mystery of the AE's existence
+is still an open question, and many studies suggest that DNN training
+algorithms have blind spots. Salient objects usually do not overlap with image
+boundaries; hence, boundaries attract little of a DNN model's attention.
+Nevertheless, recent studies show that the boundaries can dominate the behavior
+of DNN models. Hence, this study aims to look at AEs from a different
+perspective and proposes an imperceptible adversarial attack that
+systematically attacks the input image boundary to find AEs. The experimental
+results have shown that the proposed boundary attacking method effectively
+attacks six CNN models and the ViT using only 32% of the input image content
+(from the boundaries), with an average success rate (SR) of 95.2% and an
+average peak signal-to-noise ratio of 41.37 dB. Correlation analyses are
+conducted, including the relation between the adversarial boundary's width and
+the SR and how the adversarial boundary changes the DNN model's attention. This
+paper's discoveries can potentially advance the understanding of AEs and
+provide a different perspective on how AEs can be constructed.
+
<div class="w3-clear"></div>
+
+
+
+
+ + ☆ Enhancing Robot Learning through Learned Human-Attention Feature Maps ICRA 2023 + + +
+ Robust and efficient learning remains a challenging problem in robotics, in
+particular with complex visual inputs. Inspired by the human attention
+mechanism, with which we quickly process complex visual scenes and react to
+changes in the environment, we hypothesize that embedding auxiliary information
+about focus points into robot learning would enhance the efficiency and
+robustness of the learning process. In this paper, we propose a novel approach
+to model and emulate human attention with an approximate prediction model. We
+then leverage this output and feed it as a structured auxiliary feature map
+into downstream learning tasks. We validate this idea by learning a prediction
+model from human-gaze recordings of manual driving in the real world. We test
+our approach on two learning tasks - object detection and imitation learning.
+Our experiments demonstrate that the inclusion of predicted human attention
+leads to improved robustness of the trained models to out-of-distribution
+samples and faster learning in low-data regime settings. Our work highlights
+the potential of incorporating structured auxiliary information in
+representation learning for robotics and opens up new avenues for research in
+this direction. All code and data are available online.
+
<div class="w3-clear"></div>
+
+ comment: This work has been accepted for the RAP4Robots workshop at ICRA 2023 + in London +
+
+
+
+
+ + ☆ Occlusion-Aware Deep Convolutional Neural Network via Homogeneous + Tanh-transforms for Face Parsing + + +
+ Face parsing infers a pixel-wise label map for each semantic facial
+component. Previous methods generally work well for uncovered faces but
+overlook facial occlusion and ignore the contextual area outside a single
+face, even though facial occlusion became commonplace during the COVID-19
+epidemic. Inspired by the illumination theory of images, we propose a novel
+homogeneous tanh-transform for image preprocessing, made up of four
+tanh-transforms that fuse central vision and peripheral vision together.
+Our proposed method addresses the dilemma of face parsing under occlusion
+and compresses more information from the surrounding context. Based on the
+homogeneous tanh-transforms, we propose an occlusion-aware convolutional
+neural network for occluded face parsing. It combines information from both
+Tanh-polar space and Tanh-Cartesian space, which enlarges the receptive
+field. Furthermore, we introduce an occlusion-aware loss to focus on the
+boundaries of occluded regions. The network is simple and flexible, and can
+be trained end-to-end. To facilitate future research on occluded face
+parsing, we also contribute a new cleaned face parsing dataset, manually
+purified from several academic and industrial datasets, including
+CelebAMask-HQ, Short-video Face Parsing and the Helen dataset, which will
+be made public. Experiments demonstrate that our method surpasses
+state-of-the-art face parsing methods under occlusion.
+
+
+
+
+
+ + ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities,
+but their 'exposure bias' problem, described as the input mismatch between
+training and sampling, lacks in-depth exploration. In this paper, we
+systematically investigate the exposure bias problem in diffusion models by
+first analytically modelling the sampling distribution, based on which we
+identify the prediction error at each sampling step as the root cause of
+the exposure bias issue. Furthermore, we discuss potential solutions to
+this issue and propose an intuitive metric for it. Along with the
+elucidation of exposure bias, we propose a simple, yet effective,
+training-free method called Epsilon Scaling to alleviate the exposure bias.
+We show that Epsilon Scaling explicitly moves the sampling trajectory
+closer to the vector field learned in the training phase by scaling down
+the network output (Epsilon), mitigating the input mismatch between
+training and sampling. Experiments on various diffusion frameworks (ADM,
+DDPM/DDIM, LDM), unconditional and conditional settings, and deterministic
+vs. stochastic sampling verify the effectiveness of our method.
+
+
+ comment: 7 pages, code available soon +
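+ A training-free sketch of Epsilon Scaling inside a deterministic DDIM
+step, in Python/PyTorch; the scaling factor `lam` here is illustrative,
+not the paper's tuned schedule.
+
+import torch
+
+@torch.no_grad()
+def ddim_step_eps_scaled(x_t, ab_t, ab_s, eps_pred, lam=1.005):
+    # ab_t / ab_s: alpha_bar at the current / next timestep (tensors).
+    eps = eps_pred / lam                                # scale down epsilon
+    x0 = (x_t - (1 - ab_t).sqrt() * eps) / ab_t.sqrt()  # predicted clean image
+    return ab_s.sqrt() * x0 + (1 - ab_s).sqrt() * eps   # move to next step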
+
+
+
+
+ + ☆ 3D-MuPPET: 3D Multi-Pigeon Pose Estimation and Tracking + + +
+ Markerless methods for animal posture tracking have been developing
+recently, but frameworks and benchmarks for tracking large animal groups in
+3D are still lacking. To overcome this gap in the literature, we present
+3D-MuPPET, a framework to estimate and track 3D poses of up to 10 pigeons
+at interactive speed using multiple views. We train a pose estimator to
+infer 2D keypoints and bounding boxes of multiple pigeons, then triangulate
+the keypoints to 3D. For correspondence matching, we first dynamically
+match 2D detections to global identities in the first frame, then use a 2D
+tracker to maintain correspondences across views in subsequent frames. We
+achieve comparable accuracy to a state-of-the-art 3D pose estimator in
+terms of Root Mean Square Error (RMSE) and Percentage of Correct Keypoints
+(PCK). We also showcase a novel use case where our model trained with data
+of single pigeons provides comparable results on data containing multiple
+pigeons. This can simplify the domain shift to new species because
+annotating single animal data is less labour intensive than multi-animal
+data. Additionally, we benchmark the inference speed of 3D-MuPPET, with up
+to 10 fps in 2D and 1.5 fps in 3D, and perform quantitative tracking
+evaluation, which yields encouraging results. Finally, we show that
+3D-MuPPET also works in natural environments without model fine-tuning on
+additional annotations. To the best of our knowledge, we are the first to
+present a framework for 2D/3D posture and trajectory tracking that works in
+both indoor and outdoor environments.
+
+
+
+
+
+ + ☆ Spatio-temporal MLP-graph network for 3D human pose estimation + + +
+ Graph convolutional networks and their variants have shown significant +promise in 3D human pose estimation. Despite their success, most of these +methods only consider spatial correlations between body joints and do not take +into account temporal correlations, thereby limiting their ability to capture +relationships in the presence of occlusions and inherent ambiguity. To address +this potential weakness, we propose a spatio-temporal network architecture +composed of a joint-mixing multi-layer perceptron block that facilitates +communication among different joints and a graph weighted Jacobi network block +that enables communication among various feature channels. The major novelty of +our approach lies in a new weighted Jacobi feature propagation rule obtained +through graph filtering with implicit fairing. We leverage temporal information +from the 2D pose sequences, and integrate weight modulation into the model to +enable untangling of the feature transformations of distinct nodes. We also +employ adjacency modulation with the aim of learning meaningful correlations +beyond defined linkages between body joints by altering the graph topology +through a learnable modulation matrix. Extensive experiments on two benchmark +datasets demonstrate the effectiveness of our model, outperforming recent +state-of-the-art methods for 3D human pose estimation. + +
+
+
+
+
+ + ☆ MSFlow: Multi-Scale Flow-based Framework for Unsupervised Anomaly + Detection + + +
+ Unsupervised anomaly detection (UAD) attracts a lot of research interest
+and drives widespread applications, where only anomaly-free samples are
+available for training. Some UAD applications intend to further locate the
+anomalous regions without any anomaly information.
+ Although the absence of anomalous samples and annotations deteriorates
+UAD performance, an inconspicuous yet powerful statistical model, the
+normalizing flows, is appropriate for anomaly detection and localization in
+an unsupervised fashion. The flow-based probabilistic models, trained only
+on anomaly-free data, can efficiently distinguish unpredictable anomalies
+by assigning them much lower likelihoods than normal data.
+ Nevertheless, the size variation of unpredictable anomalies introduces
+another inconvenience to flow-based methods for high-precision anomaly
+detection and localization. To generalize across anomaly size variation, we
+propose a novel Multi-Scale Flow-based framework dubbed MSFlow, composed of
+asymmetrical parallel flows followed by a fusion flow to exchange
+multi-scale perceptions. Moreover, different multi-scale aggregation
+strategies are adopted for image-wise anomaly detection and pixel-wise
+anomaly localization according to the discrepancy between them. The
+proposed MSFlow is evaluated on three anomaly detection datasets,
+significantly outperforming existing methods. Notably, on the challenging
+MVTec AD benchmark, our MSFlow achieves a new state-of-the-art with a
+detection AUROC score of up to 99.7%, a localization AUROC score of 98.8%,
+and a PRO score of 97.1%. The reproducible code is available at
+https://github.com/cool-xuan/msflow.
+
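+ The scoring principle behind flow-based UAD is compact. A PyTorch sketch
+assuming a standard-normal base distribution; this shows the likelihood
+test MSFlow builds on, not its multi-scale fusion architecture.
+
+import torch
+
+def anomaly_score(z, log_det):
+    # z: (B, ...) latents from the flow; log_det: (B,) log|det J| per sample.
+    log_pz = -0.5 * (z ** 2).flatten(1).sum(dim=1)  # log N(z; 0, I) up to a constant
+    return -(log_pz + log_det)                      # higher NLL = more anomalous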
+
+
+
+
+ + ☆ ARTxAI: Explainable Artificial Intelligence Curates Deep Representation + Learning for Artistic Images using Fuzzy Techniques + + +
+ Automatic art analysis employs different image processing techniques to
+classify and categorize works of art. When working with artistic images,
+we need to take further considerations into account compared to classical
+image processing, because such paintings change drastically depending on
+the author, the scene depicted, and the artistic style. This can result in
+features that perform very well in a given task but do not grasp the whole
+of the visual and symbolic information contained in a painting. In this
+paper, we show how the features obtained from different tasks in artistic
+image classification are suitable for solving other tasks of a similar
+nature. We present different methods to improve the generalization
+capabilities and performance of artistic classification systems.
+Furthermore, we propose an explainable artificial intelligence method that
+maps known visual traits of an image to the features used by the deep
+learning model by means of fuzzy rules. These rules show the patterns and
+variables that are relevant to solving each task and how effective each of
+the discovered patterns is. Our results show that our proposed
+context-aware features can achieve up to $6\%$ and $26\%$ more accurate
+results than other context- and non-context-aware solutions, respectively,
+depending on the specific task. We also show that some of the features used
+by these models can be more clearly correlated to visual traits in the
+original image than others.
+
+
+
+
+
+ + ☆ ADFA: Attention-augmented Differentiable top-k Feature Adaptation for + Unsupervised Medical Anomaly Detection + + +
+ The scarcity of annotated data, particularly for rare diseases, limits the +variability of training data and the range of detectable lesions, presenting a +significant challenge for supervised anomaly detection in medical imaging. To +solve this problem, we propose a novel unsupervised method for medical image +anomaly detection: Attention-Augmented Differentiable top-k Feature Adaptation +(ADFA). The method utilizes Wide-ResNet50-2 (WR50) network pre-trained on +ImageNet to extract initial feature representations. To reduce the channel +dimensionality while preserving relevant channel information, we employ an +attention-augmented patch descriptor on the extracted features. We then apply +differentiable top-k feature adaptation to train the patch descriptor, mapping +the extracted feature representations to a new vector space, enabling effective +detection of anomalies. Experiments show that ADFA outperforms state-of-the-art +(SOTA) methods on multiple challenging medical image datasets, confirming its +effectiveness in medical anomaly detection. + +
+
+
+
+
+
+ ☆ Cross-Modal Retrieval Meets Inference: Improving Zero-Shot Classification
+ with Cross-Modal Retrieval
+
+
+ Contrastive language-image pre-training (CLIP) has demonstrated remarkable +zero-shot classification ability, namely image classification using novel text +labels. Existing works have attempted to enhance CLIP by fine-tuning on +downstream tasks, but these have inadvertently led to performance degradation +on unseen classes, thus harming zero-shot generalization. This paper aims to +address this challenge by leveraging readily available image-text pairs from an +external dataset for cross-modal guidance during inference. To this end, we +propose X-MoRe, a novel inference method comprising two key steps: (1) +cross-modal retrieval and (2) modal-confidence-based ensemble. Given a query +image, we harness the power of CLIP's cross-modal representations to retrieve +relevant textual information from an external image-text pair dataset. Then, we +assign higher weights to the more reliable modality between the original query +image and retrieved text, contributing to the final prediction. X-MoRe +demonstrates robust performance across a diverse set of tasks without the need +for additional training, showcasing the effectiveness of utilizing cross-modal +features to maximize CLIP's zero-shot ability. + +
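+ A simplified sketch of the two steps in Python/NumPy, assuming
+L2-normalised CLIP embeddings and using a plain max-logit weighting as a
+stand-in for the paper's modal-confidence ensemble.
+
+import numpy as np
+
+def retrieve_then_ensemble(img_emb, caption_bank, class_emb, topk=4):
+    # (1) Cross-modal retrieval: nearest external captions to the image.
+    idx = np.argsort(-(caption_bank @ img_emb))[:topk]
+    text_evidence = caption_bank[idx].mean(axis=0)
+    # (2) Ensemble: weight the modality whose top logit is more confident.
+    logits_img = class_emb @ img_emb
+    logits_txt = class_emb @ text_evidence
+    w = 1.0 / (1.0 + np.exp(logits_txt.max() - logits_img.max()))
+    return w * logits_img + (1.0 - w) * logits_txt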
+
+
+
+
+ + ☆ NOVIS: A Case for End-to-End Near-Online Video Instance Segmentation + + +
+ Until recently, the Video Instance Segmentation (VIS) community operated
+under the common belief that offline methods are generally superior to
+frame-by-frame online processing. However, the recent success of online
+methods questions this belief, in particular for challenging and long video
+sequences. We understand this work as a rebuttal of those recent
+observations and an appeal to the community to focus on dedicated
+near-online VIS approaches. To support our argument, we present a detailed
+analysis of different processing paradigms and the new end-to-end trainable
+NOVIS (Near-Online Video Instance Segmentation) method. Our
+transformer-based model directly predicts spatio-temporal mask volumes for
+clips of frames and performs instance tracking between clips via overlap
+embeddings. NOVIS represents the first near-online VIS approach which
+avoids any handcrafted tracking heuristics. We outperform all existing VIS
+methods by large margins and provide new state-of-the-art results on both
+YouTube-VIS (2019/2021) and the OVIS benchmarks.
+
+
+
+
+
+ + ☆ Enhancing OCR Performance through Post-OCR Models: Adopting Glyph + Embedding for Improved Correction + + +
+ The study investigates the potential of post-OCR models to overcome +limitations in OCR models and explores the impact of incorporating glyph +embedding on post-OCR correction performance. In this study, we have developed +our own post-OCR correction model. The novelty of our approach lies in +embedding the OCR output using CharBERT and our unique embedding technique, +capturing the visual characteristics of characters. Our findings show that +post-OCR correction effectively addresses deficiencies in inferior OCR models, +and glyph embedding enables the model to achieve superior results, including +the ability to correct individual words. + +
+
+
+
+
+ + ☆ Rotation Augmented Distillation for Exemplar-Free Class Incremental + Learning with Detailed Analysis + + +
+ Class incremental learning (CIL) aims to recognize both old and new
+classes across a sequence of incremental tasks. Deep neural networks in CIL
+suffer from catastrophic forgetting, and some approaches rely on saving
+exemplars from previous tasks, known as the exemplar-based setting, to
+alleviate this problem. In contrast, this paper focuses on the
+Exemplar-Free setting, in which no old class samples are preserved.
+Balancing plasticity and stability in deep feature learning with
+supervision from new classes only is more challenging. Most existing
+Exemplar-Free CIL methods report only the overall performance and lack
+further analysis. In this work, different methods are examined with
+complementary metrics in greater detail. Moreover, we propose a simple CIL
+method, Rotation Augmented Distillation (RAD), which achieves one of the
+top-tier performances under the Exemplar-Free setting. Detailed analysis
+shows that our RAD benefits from a superior balance between plasticity and
+stability. Finally, more challenging exemplar-free settings with fewer
+initial classes are undertaken for further demonstrations and comparisons
+among the state-of-the-art methods.
+
+
+
+
+
+ + ☆ CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for + Multimodal Machine Translation ICCV + + +
+ There has been a growing interest in developing multimodal machine +translation (MMT) systems that enhance neural machine translation (NMT) with +visual knowledge. This problem setup involves using images as auxiliary +information during training, and more recently, eliminating their use during +inference. Towards this end, previous works face a challenge in training +powerful MMT models from scratch due to the scarcity of annotated multilingual +vision-language data, especially for low-resource languages. Simultaneously, +there has been an influx of multilingual pre-trained models for NMT and +multimodal pre-trained models for vision-language tasks, primarily in English, +which have shown exceptional generalisation ability. However, these are not +directly applicable to MMT since they do not provide aligned multimodal +multilingual features for generative tasks. To alleviate this issue, instead of +designing complex modules for MMT, we propose CLIPTrans, which simply adapts +the independently pre-trained multimodal M-CLIP and the multilingual mBART. In +order to align their embedding spaces, mBART is conditioned on the M-CLIP +features by a prefix sequence generated through a lightweight mapping network. +We train this in a two-stage pipeline which warms up the model with image +captioning before the actual translation task. Through experiments, we +demonstrate the merits of this framework and consequently push forward the +state-of-the-art across standard benchmarks by an average of +2.67 BLEU. The +code can be found at www.github.com/devaansh100/CLIPTrans. + +
+
+ comment: 15 pages, 9 figures, to be published in Proceedings of the
+ International Conference on Computer Vision (ICCV), 2023
+
+
+
+
+ + ☆ Optron: Better Medical Image Registration via Training in the Loop + + +
+ In the field of medical image registration there have primarily been two
+paradigms: traditional optimization-based methods and deep-learning-based
+methods. Each of these paradigms has its advantages, and in this work we
+aim to take the best of both worlds. Instead of developing a new deep
+learning model, we designed a robust training architecture that is simple
+and generalizable. We present Optron, a general training architecture
+incorporating the idea of training-in-the-loop. By iteratively optimizing
+the prediction result of a deep learning model through a plug-and-play
+optimizer module in the training loop, Optron introduces pseudo ground
+truth to an unsupervised training process. By bringing the training process
+closer to that of supervised training, Optron can consistently improve the
+models' performance and convergence speed. We evaluated our method on
+various combinations of models and datasets, and we have achieved
+state-of-the-art performance on the IXI dataset, improving the previous
+state-of-the-art method TransMorph by a significant margin of +1.6% DSC.
+Moreover, Optron also consistently achieved positive results with other
+models and datasets. It increases the validation DSC for VoxelMorph and
+ViT-V-Net by +2.3% and +2.2% respectively on IXI, demonstrating our
+method's generalizability. Our implementation is publicly available at
+https://github.com/miraclefactory/optron
+
+
+ comment: 10 pages, 5 figures, 4 tables +
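+ The training-in-the-loop idea reduces to a few lines. A hypothetical
+PyTorch-style sketch, where `refine` stands in for the plug-and-play
+optimizer module (an assumed interface, not the released API):
+
+def optron_step(model, refine, fixed, moving, loss_fn, opt):
+    phi = model(fixed, moving)                      # predicted deformation
+    phi_star = refine(phi.detach(), fixed, moving)  # optimize it in the loop
+    loss = loss_fn(phi, phi_star)                   # pseudo-ground-truth loss
+    opt.zero_grad()
+    loss.backward()
+    opt.step()
+    return loss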
+
+
+
+
+ + ☆ Is visual explanation with Grad-CAM more reliable for deeper neural + networks? a case study with automatic pneumothorax diagnosis + + +
+ While deep learning techniques have provided state-of-the-art performance
+in various clinical tasks, explainability regarding their decision-making
+process can greatly enhance the credence of these methods for safer and
+quicker clinical adoption. With high flexibility, Gradient-weighted Class
+Activation Mapping (Grad-CAM) has been widely adopted to offer intuitive
+visual interpretation of various deep learning models' reasoning processes
+in computer-assisted diagnosis. However, despite the popularity of the
+technique, there is still a lack of systematic study of Grad-CAM's
+performance on different deep learning architectures. In this study, we
+investigate its robustness and effectiveness across different popular deep
+learning models, with a focus on the impact of the networks' depths and
+architecture types, using a case study of automatic pneumothorax diagnosis
+in X-ray scans. Our results show that deeper neural networks do not
+necessarily contribute to a strong improvement of pneumothorax diagnosis
+accuracy, and the effectiveness of Grad-CAM also varies among different
+network architectures.
+
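+ For reference, the Grad-CAM computation under study is standard and fits
+in a few lines of PyTorch, given conv activations captured with a forward
+hook on the chosen layer.
+
+import torch
+import torch.nn.functional as F
+
+def grad_cam(feats, class_score):
+    # feats: (1, C, H, W) with requires_grad; class_score: scalar logit.
+    grads, = torch.autograd.grad(class_score, feats)
+    weights = grads.mean(dim=(2, 3), keepdim=True)  # GAP of the gradients
+    cam = F.relu((weights * feats).sum(dim=1))      # weighted sum, then ReLU
+    return cam / cam.max().clamp_min(1e-8)          # normalised heatmap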
+
+
+
+
+ + ☆ A lightweight 3D dense facial landmark estimation model from position + map data + + +
+ The incorporation of 3D data in facial analysis tasks has gained
+popularity in recent years. Though it provides a more accurate and detailed
+representation of the human face, acquiring 3D face data is more complex
+and expensive than capturing 2D face images. One either has to rely on
+expensive 3D scanners or on depth sensors, which are prone to noise. An
+alternative option is the reconstruction of 3D faces from uncalibrated 2D
+images in an unsupervised way, without any ground truth 3D data. However,
+such approaches are computationally expensive and the learned model size is
+not suitable for mobile or other edge device applications. Predicting dense
+3D landmarks over the whole face can overcome this issue. As there is no
+public dataset available containing dense landmarks, we propose a pipeline
+to create a dense keypoint training dataset containing 520 key points
+across the whole face from existing facial position map data. We train a
+lightweight MobileNet-based regressor model with the generated data. As we
+do not have access to any evaluation dataset with dense landmarks, we
+evaluate our model against the 68-keypoint detection task. Experimental
+results show that our trained model outperforms many of the existing
+methods despite its smaller model size and minimal computational cost.
+Also, the qualitative evaluation shows the efficiency of our trained models
+at extreme head pose angles as well as under other facial variations and
+occlusions.
+
+
+ comment: 8 pages, The Irish Machine Vision and Image Processing
+ Conference (IMVIP)
+
+
+
+
+ + ☆ Uncovering the Unseen: Discover Hidden Intentions by Micro-Behavior + Graph Reasoning + + +
+ This paper introduces a new and challenging Hidden Intention Discovery (HID) +task. Unlike existing intention recognition tasks, which are based on obvious +visual representations to identify common intentions for normal behavior, HID +focuses on discovering hidden intentions when humans try to hide their +intentions for abnormal behavior. HID presents a unique challenge in that +hidden intentions lack the obvious visual representations to distinguish them +from normal intentions. Fortunately, from a sociological and psychological +perspective, we find that the difference between hidden and normal intentions +can be reasoned from multiple micro-behaviors, such as gaze, attention, and +facial expressions. Therefore, we first discover the relationship between +micro-behavior and hidden intentions and use graph structure to reason about +hidden intentions. To facilitate research in the field of HID, we also +constructed a seminal dataset containing a hidden intention annotation of a +typical theft scenario for HID. Extensive experiments show that the proposed +network improves performance on the HID task by 9.9\% over the state-of-the-art +method SBP. + +
+
+
+
+
+ + ☆ A Multimodal Visual Encoding Model Aided by Introducing Verbal Semantic + Information + + +
+ Biological research has revealed that verbal semantic information in the
+brain cortex, as an additional source, participates in nonverbal semantic
+tasks such as visual encoding. However, previous visual encoding models did
+not incorporate verbal semantic information, contradicting this biological
+finding. In response, this paper proposes a multimodal visual information
+encoding network model based on stimulus images and associated textual
+information. Our visual information encoding network model takes stimulus
+images as input and leverages textual information generated by a text-image
+generation model as verbal semantic information. This approach injects new
+information into the visual encoding model. Subsequently, a Transformer
+network aligns image and text feature information, creating a multimodal
+feature space. A convolutional network then maps from this multimodal
+feature space to voxel space, constructing the multimodal visual
+information encoding network model. Experimental results demonstrate that
+the proposed multimodal visual information encoding network model
+outperforms previous models at the same training cost. In voxel prediction
+for the left hemisphere of subject 1's brain, performance improves by
+approximately 15.87%, while for the right hemisphere, performance improves
+by about 4.6%. The multimodal visual encoding network model exhibits
+superior encoding performance. Additionally, ablation experiments indicate
+that our proposed model better simulates the brain's visual information
+processing.
+
+
+
+
+
+ + ☆ Uncertainty Aware Training to Improve Deep Learning Model Calibration + for Classification of Cardiac MR Images + + +
+ Quantifying the uncertainty of predictions has been identified as one way
+to develop more trustworthy artificial intelligence (AI) models beyond
+conventional reporting of performance metrics. When considering their role
+in a clinical decision support setting, AI classification models should
+ideally avoid confident wrong predictions and maximise the confidence of
+correct predictions. Models that do this are said to be well-calibrated
+with regard to confidence. However, relatively little attention has been
+paid to how to improve calibration when training these models, i.e., to
+make the training strategy uncertainty-aware. In this work we evaluate
+three novel uncertainty-aware training strategies, comparing against two
+state-of-the-art approaches. We analyse performance on two different
+clinical applications: cardiac resynchronisation therapy (CRT) response
+prediction and coronary artery disease (CAD) diagnosis from cardiac
+magnetic resonance (CMR) images. The best-performing model in terms of both
+classification accuracy and the most common calibration measure, expected
+calibration error (ECE), was the Confidence Weight method, a novel approach
+that weights the loss of samples to explicitly penalise confident incorrect
+predictions. The method reduced the ECE by 17% for CRT response prediction
+and by 22% for CAD diagnosis when compared to a baseline classifier in
+which no uncertainty-aware strategy was included. In both applications, as
+well as reducing the ECE, there was a slight increase in accuracy, from 69%
+to 70% for CRT response prediction and from 70% to 72% for CAD diagnosis.
+However, our analysis showed a lack of consistency in terms of optimal
+models when using different calibration measures. This indicates the need
+for careful consideration of performance metrics when training and
+selecting models for complex high-risk applications in healthcare.
+
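+ The calibration measure used above has a standard form. An
+equal-width-bin ECE in Python/NumPy:
+
+import numpy as np
+
+def expected_calibration_error(conf, correct, n_bins=10):
+    # conf: (N,) confidences in [0, 1]; correct: (N,) 0/1 outcomes.
+    edges = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(edges[:-1], edges[1:]):
+        in_bin = (conf > lo) & (conf <= hi)
+        if in_bin.any():
+            gap = abs(correct[in_bin].mean() - conf[in_bin].mean())
+            ece += in_bin.mean() * gap
+    return ece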
+
+
+
+
+ + ☆ Abdominal Multi-Organ Segmentation Based on Feature Pyramid Network and + Spatial Recurrent Neural Network + + +
+ As recent advances in AI are causing the decline of conventional diagnostic +methods, the realization of end-to-end diagnosis is fast approaching. +Ultrasound image segmentation is an important step in the diagnostic process. +An accurate and robust segmentation model accelerates the process and reduces +the burden of sonographers. In contrast to previous research, we take two +inherent features of ultrasound images into consideration: (1) different organs +and tissues vary in spatial sizes, (2) the anatomical structures inside human +body form a relatively constant spatial relationship. Based on those two ideas, +we propose a new image segmentation model combining Feature Pyramid Network +(FPN) and Spatial Recurrent Neural Network (SRNN). We discuss why we use FPN to +extract anatomical structures of different scales and how SRNN is implemented +to extract the spatial context features in abdominal ultrasound images. + +
+
+ comment: IFAC World Congress 2023 paper +
+
+
+
+
+ + ☆ CAGRA: Highly Parallel Graph Construction and Approximate Nearest + Neighbor Search for GPUs + + +
+ Approximate Nearest Neighbor Search (ANNS) plays a critical role in
+various disciplines spanning data mining and artificial intelligence, from
+information retrieval and computer vision to natural language processing
+and recommender systems. Data volumes have soared in recent years and the
+computational cost of an exhaustive exact nearest neighbor search is often
+prohibitive, necessitating the adoption of approximate techniques. The
+balanced performance and recall of graph-based approaches have recently
+garnered significant attention in ANNS algorithms; however, only a few
+studies have explored harnessing the power of GPUs and multi-core
+processors despite the widespread use of massively parallel and
+general-purpose computing. To bridge this gap, we introduce a novel
+parallel computing hardware-based proximity graph and search algorithm. By
+leveraging the high-performance capabilities of modern hardware, our
+approach achieves remarkable efficiency gains. In particular, our method
+surpasses existing CPU and GPU-based methods in constructing the proximity
+graph, demonstrating higher throughput in both large- and small-batch
+searches while maintaining comparable accuracy. In graph construction time,
+our method, CAGRA, is 2.2~27x faster than HNSW, one of the CPU SOTA
+implementations. In large-batch query throughput in the 90% to 95% recall
+range, our method is 33~77x faster than HNSW, and is 3.8~8.8x faster than
+the SOTA implementations for GPU. For a single query, our method is 3.4~53x
+faster than HNSW at 95% recall.
+
+
+
+
+
+ + ☆ Evaluation and Analysis of Hallucination in Large Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) have recently achieved remarkable
+success. However, LVLMs are still plagued by the hallucination problem,
+which limits their practicality in many scenarios. Hallucination refers to
+information in LVLMs' responses that does not exist in the visual input,
+posing potential risks of substantial consequences. There has been limited
+work studying hallucination evaluation in LVLMs. In this paper, we propose
+Hallucination Evaluation based on Large Language Models (HaELM), an
+LLM-based hallucination evaluation framework. HaELM achieves approximately
+95% of ChatGPT's performance and has additional advantages including low
+cost, reproducibility, privacy preservation and local deployment.
+Leveraging HaELM, we evaluate the hallucination in current LVLMs.
+Furthermore, we analyze the factors contributing to hallucination in LVLMs
+and offer helpful suggestions to mitigate the hallucination problem. Our
+training data and human annotation hallucination data will be made public
+soon.
+
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ AI-Based Facial Emotion Recognition Solutions for Education: A Study of + Teacher-User and Other Categories + + +
+ Existing information on AI-based facial emotion recognition (FER) is not +easily comprehensible by those outside the field of computer science, requiring +cross-disciplinary effort to determine a categorisation framework that promotes +the understanding of this technology, and its impact on users. Most proponents +classify FER in terms of methodology, implementation and analysis; relatively +few by its application in education; and none by its users. This paper is +concerned primarily with (potential) teacher-users of FER tools for education. +It proposes a three-part classification of these teachers, by orientation, +condition and preference, based on a classical taxonomy of affective +educational objectives, and related theories. It also compiles and organises +the types of FER solutions found in or inferred from the literature into +"technology" and "applications" categories, as a prerequisite for structuring +the proposed "teacher-user" category. This work has implications for +proponents', critics', and users' understanding of the relationship between +teachers and FER. + +
+
+
+
+
+ + ☆ DiffusionVMR: Diffusion Model for Video Moment Retrieval + + +
+ Video moment retrieval is a fundamental visual-language task that aims to +retrieve target moments from an untrimmed video based on a language query. +Existing methods typically generate numerous proposals manually or via +generative networks in advance as the support set for retrieval, which is not +only inflexible but also time-consuming. Inspired by the success of diffusion +models on object detection, this work aims at reformulating video moment +retrieval as a denoising generation process to get rid of the inflexible and +time-consuming proposal generation. To this end, we propose a novel +proposal-free framework, namely DiffusionVMR, which directly samples random +spans from noise as candidates and introduces denoising learning to ground +target moments. During training, Gaussian noise is added to the real moments, +and the model is trained to learn how to reverse this process. In inference, a +set of time spans is progressively refined from the initial noise to the final +output. Notably, the training and inference of DiffusionVMR are decoupled, and +an arbitrary number of random spans can be used in inference without being +consistent with the training phase. Extensive experiments conducted on three +widely-used benchmarks (i.e., QVHighlight, Charades-STA, and TACoS) demonstrate +the effectiveness of the proposed DiffusionVMR by comparing it with +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Group-Conditional Conformal Prediction via Quantile Regression + Calibration for Crop and Weed Classification + + +
+ As deep learning predictive models become an integral part of a large
+spectrum of precision agricultural systems, a barrier to the adoption of
+such automated solutions is the lack of user trust in these highly complex,
+opaque and uncertain models. Indeed, deep neural networks are not equipped
+with any explicit guarantees that can be used to certify the system's
+performance, especially in highly varying uncontrolled environments such as
+the ones typically faced in computer vision for agriculture. Fortunately,
+certain methods developed in other communities can prove to be important
+for agricultural applications. This article presents the conformal
+prediction framework, which provides valid statistical guarantees on the
+predictive performance of any black box prediction machine, with almost no
+assumptions, applied to the problem of deep visual classification of weeds
+and crops in real-world conditions. The framework is presented with a focus
+on its practical aspects and special attention accorded to the Adaptive
+Prediction Sets (APS) approach that delivers marginal guarantees on the
+model's coverage. Marginal results are then shown to be insufficient to
+guarantee performance on all groups of individuals in the population as
+characterized by their environmental and pedo-climatic auxiliary data
+gathered during image acquisition. To tackle this shortcoming,
+group-conditional conformal approaches are presented: the "classical"
+method that consists of iteratively applying the APS procedure on all
+groups, and a proposed elegant reformulation and implementation of the
+procedure using quantile regression on group membership indicators.
+Empirical results showing the validity of the proposed approach are
+presented, compared to the marginal APS, and then discussed.
+
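+ For concreteness, a sketch of the marginal APS procedure referenced
+above, in Python/NumPy (split-conformal form; the group-conditional and
+quantile-regression variants build on this):
+
+import numpy as np
+
+def aps_score(p, y):
+    # Cumulative probability mass needed to include the true label y.
+    order = np.argsort(-p)
+    return np.cumsum(p[order])[np.where(order == y)[0][0]]
+
+def aps_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
+    scores = [aps_score(p, y) for p, y in zip(cal_probs, cal_labels)]
+    n = len(scores)
+    level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
+    q = np.quantile(scores, level, method="higher")  # calibrated threshold
+    sets = []
+    for p in test_probs:
+        order = np.argsort(-p)
+        k = int(np.searchsorted(np.cumsum(p[order]), q)) + 1
+        sets.append(order[:k])
+    return sets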
+
+
+
+
+ + ☆ Using deep learning for an automatic detection and classification of the + vascular bifurcations along the Circle of Willis + + +
+ Most intracranial aneurysms (ICA) occur on a specific portion of the
+cerebral vascular tree named the Circle of Willis (CoW). More particularly,
+they mainly arise on fifteen of the major arterial bifurcations
+constituting this circular structure. Hence, for an efficient and timely
+diagnosis it is critical to develop methods able to accurately recognize
+each Bifurcation of Interest (BoI). Indeed, an automatic extraction of the
+bifurcations presenting the higher risk of developing an ICA would offer
+the neuroradiologists a quick glance at the most alarming areas. Thanks to
+recent advances in Artificial Intelligence, Deep Learning has turned out to
+be the best performing technology for many pattern recognition tasks.
+Moreover, various methods have been specifically designed for medical image
+analysis purposes. This study intends to assist the neuroradiologists to
+promptly locate any bifurcation presenting a high risk of ICA occurrence.
+It can be seen as a Computer Aided Diagnosis scheme, where the Artificial
+Intelligence facilitates the access to the regions of interest within the
+MRI. In this work, we propose a method for a fully automatic detection and
+recognition of the bifurcations of interest forming the Circle of Willis.
+Several neural network architectures have been tested, and we thoroughly
+evaluate the bifurcation recognition rate.
+
+
+
+
+
+ + ☆ Learning to Upsample by Learning to Sample ICCV 2023 + + +
+ We present DySample, an ultra-lightweight and effective dynamic upsampler. +While impressive performance gains have been witnessed from recent kernel-based +dynamic upsamplers such as CARAFE, FADE, and SAPA, they introduce much +workload, mostly due to the time-consuming dynamic convolution and the +additional sub-network used to generate dynamic kernels. Further, the need for +high-res feature guidance of FADE and SAPA somehow limits their application +scenarios. To address these concerns, we bypass dynamic convolution and +formulate upsampling from the perspective of point sampling, which is more +resource-efficient and can be easily implemented with the standard built-in +function in PyTorch. We first showcase a naive design, and then demonstrate how +to strengthen its upsampling behavior step by step towards our new upsampler, +DySample. Compared with former kernel-based dynamic upsamplers, DySample +requires no customized CUDA package and has much fewer parameters, FLOPs, GPU +memory, and latency. Besides the light-weight characteristics, DySample +outperforms other upsamplers across five dense prediction tasks, including +semantic segmentation, object detection, instance segmentation, panoptic +segmentation, and monocular depth estimation. Code is available at +https://github.com/tiny-smart/dysample. + +
+
+ comment: Accepted by ICCV 2023 +
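+ The point-sampling formulation is essentially a perturbed regular grid
+fed to `grid_sample`. A PyTorch sketch, assuming the learned offsets are
+already produced at the target resolution (the offset generator is the
+part DySample actually designs):
+
+import torch
+import torch.nn.functional as F
+
+def point_sample_upsample(x, offsets, scale=2):
+    # x: (B, C, H, W); offsets: (B, sH, sW, 2) in normalised (x, y) coords.
+    b, _, h, w = x.shape
+    sh, sw = h * scale, w * scale
+    ys = torch.linspace(-1, 1, sh)
+    xs = torch.linspace(-1, 1, sw)
+    base = torch.stack(torch.meshgrid(ys, xs, indexing="ij"), dim=-1).flip(-1)
+    grid = base.unsqueeze(0) + offsets       # perturb the regular grid
+    return F.grid_sample(x, grid, align_corners=True)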
+
+
+
+
+ + ☆ Class Prior-Free Positive-Unlabeled Learning with Taylor Variational + Loss for Hyperspectral Remote Sensing Imagery ICCV 2023 + + +
+ Positive-unlabeled learning (PU learning) in hyperspectral remote sensing +imagery (HSI) is aimed at learning a binary classifier from positive and +unlabeled data, which has broad prospects in various earth vision applications. +However, when PU learning meets limited labeled HSI, the unlabeled data may +dominate the optimization process, which makes the neural networks overfit the +unlabeled data. In this paper, a Taylor variational loss is proposed for HSI PU +learning, which reduces the weight of the gradient of the unlabeled data by +Taylor series expansion to enable the network to find a balance between +overfitting and underfitting. In addition, the self-calibrated optimization +strategy is designed to stabilize the training process. Experiments on 7 +benchmark datasets (21 tasks in total) validate the effectiveness of the +proposed method. Code is at: https://github.com/Hengwei-Zhao96/T-HOneCls. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Exploring Model Transferability through the Lens of Potential Energy ICCV 2023 + + +
+ Transfer learning has become crucial in computer vision tasks due to the vast +availability of pre-trained deep learning models. However, selecting the +optimal pre-trained model from a diverse pool for a specific downstream task +remains a challenge. Existing methods for measuring the transferability of +pre-trained models rely on statistical correlations between encoded static +features and task labels, but they overlook the impact of underlying +representation dynamics during fine-tuning, leading to unreliable results, +especially for self-supervised models. In this paper, we present an insightful +physics-inspired approach named PED to address these challenges. We reframe the +challenge of model selection through the lens of potential energy and directly +model the interaction forces that influence fine-tuning dynamics. By capturing +the motion of dynamic representations to decline the potential energy within a +force-driven physical model, we can acquire an enhanced and more stable +observation for estimating transferability. The experimental results on 10 +downstream tasks and 12 self-supervised models demonstrate that our approach +can seamlessly integrate into existing ranking techniques and enhance their +performances, revealing its effectiveness for the model selection task and its +potential for understanding the mechanism in transfer learning. Code will be +available at https://github.com/lixiaotong97/PED. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior + + +
+ We present DiffBIR, which leverages pretrained text-to-image diffusion
+models for the blind image restoration problem. Our framework adopts a
+two-stage pipeline. In the first stage, we pretrain a restoration module
+across diversified degradations to improve generalization capability in
+real-world scenarios. The second stage leverages the generative ability of
+latent diffusion models to achieve realistic image restoration.
+Specifically, we introduce an injective modulation sub-network --
+LAControlNet -- for finetuning, while keeping the pre-trained Stable
+Diffusion unchanged to maintain its generative ability. Finally, we
+introduce a controllable module that allows users to balance quality and
+fidelity by introducing latent image guidance in the denoising process
+during inference. Extensive experiments have demonstrated its superiority
+over state-of-the-art approaches for both blind image super-resolution and
+blind face restoration tasks on synthetic and real-world datasets. The code
+is available at https://github.com/XPixelGroup/DiffBIR.
+
+
+
+
+
+ + ☆ A Comprehensive Augmentation Framework for Anomaly Detection + + +
+ Data augmentation methods are commonly integrated into the training of
+anomaly detection models. Previous approaches have primarily focused on
+replicating real-world anomalies or enhancing diversity, without
+considering that the standard of anomaly varies across different classes,
+potentially leading to a biased training distribution. This paper analyzes
+crucial traits of simulated anomalies that contribute to the training of
+reconstructive networks and condenses them into several methods, thus
+creating a comprehensive framework by selectively utilizing appropriate
+combinations. Furthermore, we integrate this framework with a
+reconstruction-based approach and concurrently propose a split training
+strategy that alleviates the issue of overfitting while avoiding
+introducing interference to the reconstruction process. The evaluations
+conducted on the MVTec anomaly detection dataset demonstrate that our
+method outperforms the previous state-of-the-art approach, particularly in
+terms of object classes. To evaluate generalizability, we generate a
+simulated dataset comprising anomalies with diverse characteristics, since
+the original test samples only include specific types of anomalies and may
+lead to biased evaluations. Experimental results demonstrate that our
+approach exhibits promising potential for generalizing effectively to
+various unforeseen anomalies encountered in real-world scenarios.
+
+
+
+
+
+ + ☆ Learning Cross-modality Information Bottleneck Representation for + Heterogeneous Person Re-Identification + + +
+ Visible-Infrared person re-identification (VI-ReID) is an important and
+challenging task in intelligent video surveillance. Existing methods mainly
+focus on learning a shared feature space to reduce the modality discrepancy
+between visible and infrared modalities, which still leaves two problems
+underexplored: information redundancy and modality complementarity. To this
+end, properly eliminating the identity-irrelevant information as well as
+making up for the modality-specific information are critical and remain a
+challenging endeavor. To tackle the above problems, we present a novel
+mutual information and modality consensus network, namely CMInfoNet, to
+extract modality-invariant identity features with the most representative
+information and reduce the redundancies. The key insight of our method is
+to find an optimal representation to capture more identity-relevant
+information and compress the irrelevant parts by optimizing a mutual
+information bottleneck trade-off. Besides, we propose an automatic search
+strategy to find the most prominent parts that identify the pedestrians. To
+eliminate the cross- and intra-modality variations, we also devise a
+modality consensus module to align the visible and infrared modalities for
+task-specific guidance. Moreover, the global-local feature representations
+can also be acquired for key parts discrimination. Experimental results on
+six benchmarks, i.e., the SYSU-MM01, RegDB, Occluded-DukeMTMC,
+Occluded-REID, Partial-REID and Partial_iLIDS datasets, have demonstrated
+the effectiveness of CMInfoNet.
+
+
+
+
+
+ + ☆ AIoT-Based Drum Transcription Robot using Convolutional Neural Networks + + +
+ With the development of information technology, robot technology has made +great progress in various fields. These new technologies enable robots to be +used in industry, agriculture, education and other aspects. In this paper, we +propose a drum robot that can automatically complete music transcription in +real-time, which is based on AIoT and fog computing technology. Specifically, +this drum robot system consists of a cloud node for data storage, edge nodes +for real-time computing, and data-oriented execution application nodes. In +order to analyze drumming music and realize drum transcription, we further +propose a light-weight convolutional neural network model to classify drums, +which can be more effectively deployed in terminal devices for fast edge +calculations. The experimental results show that the proposed system can +achieve more competitive performance and enjoy a variety of smart applications +and services. + +
+
+
+
+
+ + ☆ A Consumer-tier based Visual-Brain Machine Interface for Augmented + Reality Glasses Interactions + + +
+ Objective. Visual-Brain Machine Interfaces (V-BMI) provide a novel
+interaction technique for Augmented Reality (AR) industries. Several
+state-of-the-art works have demonstrated their high accuracy and real-time
+interaction capabilities. However, most of these studies employ EEG devices
+that are rigid and difficult to apply in real-life AR glasses application
+scenarios. Here we develop a consumer-tier Visual-Brain Machine Interface
+(V-BMI) system specialized for Augmented Reality (AR) glasses interactions.
+Approach. The developed system consists of wearable hardware which takes
+advantage of fast set-up, reliable recording and a comfortable wearable
+experience, specialized for AR glasses applications. Complementing this
+hardware, we have devised a software framework that facilitates real-time
+interactions within the system while accommodating a modular configuration
+to enhance scalability. Main results. The developed hardware weighs only
+110 g and measures 120x85x23 mm, with 1 TOhm input impedance and a
+peak-to-peak voltage of less than 1.5 uV; a V-BMI based Angry Birds game
+and an Internet of Things (IoT) AR application were designed, and we
+demonstrated the technology's merits of intuitive experience and efficient
+interaction. The real-time interaction accuracy is between 85% and 96% in
+commercial AR glasses (DTI of 2.24 s and ITR of 65 bits/min).
+Significance. Our study indicates that the developed system can provide an
+essential hardware-software framework for consumer V-BMI AR glasses. We
+also derive several pivotal design factors for a consumer-grade V-BMI-based
+AR system: 1) dynamic adaptation of stimulation patterns and classification
+methods via computer vision algorithms is necessary for AR glasses
+applications; and 2) algorithmic localization fosters system stability and
+latency reduction.
+
+
+ comment: 15 pages, 10 figures
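+ The reported ITR can be sanity-checked against the standard Wolpaw
+formula; a small Python helper (the 65 bits/min figure depends on the
+paper's exact target count, accuracy and selection time):
+
+import math
+
+def itr_bits_per_min(n_classes, acc, t_select_s):
+    # Wolpaw ITR: bits per selection, scaled to a per-minute rate.
+    n, p = n_classes, acc
+    if p >= 1.0:
+        return math.log2(n) * 60.0 / t_select_s
+    bits = (math.log2(n) + p * math.log2(p)
+            + (1 - p) * math.log2((1 - p) / (n - 1)))
+    return bits * 60.0 / t_select_s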
+
+
+
+
+ + ☆ iBARLE: imBalance-Aware Room Layout Estimation + + +
+ Room layout estimation predicts layouts from a single panorama. It
+requires datasets with large-scale and diverse room shapes to train the
+models. However, there are significant imbalances in real-world datasets,
+including the dimensions of layout complexity, camera locations, and
+variation in scene appearance. These issues considerably influence the
+model training performance. In this work, we propose the imBalance-Aware
+Room Layout Estimation (iBARLE) framework to address these issues. iBARLE
+consists of (1) an Appearance Variation Generation (AVG) module, which
+promotes visual appearance domain generalization, (2) a Complex Structure
+Mix-up (CSMix) module, which enhances generalizability w.r.t. room
+structure, and (3) a gradient-based layout objective function, which allows
+more effective accounting for occlusions in complex layouts. All modules
+are jointly trained and help each other to achieve the best performance.
+Experiments and ablation studies based on the ZInD (Cruz et al., 2021)
+dataset illustrate that iBARLE achieves state-of-the-art performance
+compared with other layout estimation baselines.
+
+
+
+
+
+ + ☆ Pose-Free Neural Radiance Fields via Implicit Pose Regularization ICCV2023 + + +
+ Pose-free neural radiance fields (NeRF) aim to train NeRF with unposed +multi-view images and it has achieved very impressive success in recent years. +Most existing works share the pipeline of training a coarse pose estimator with +rendered images at first, followed by a joint optimization of estimated poses +and neural radiance field. However, as the pose estimator is trained with only +rendered images, the pose estimation is usually biased or inaccurate for real +images due to the domain gap between real images and rendered images, leading +to poor robustness for the pose estimation of real images and further local +minima in joint optimization. We design IR-NeRF, an innovative pose-free NeRF +that introduces implicit pose regularization to refine pose estimator with +unposed real images and improve the robustness of the pose estimation for real +images. With a collection of 2D images of a specific scene, IR-NeRF constructs +a scene codebook that stores scene features and captures the scene-specific +pose distribution implicitly as priors. Thus, the robustness of pose estimation +can be promoted with the scene priors according to the rationale that a 2D real +image can be well reconstructed from the scene codebook only when its estimated +pose lies within the pose distribution. Extensive experiments show that IR-NeRF +achieves superior novel view synthesis and outperforms the state-of-the-art +consistently across multiple synthetic and real datasets. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Is it an i or an l: Test-time Adaptation of Text Line Recognition Models + + +
+ Recognizing text lines from images is a challenging problem, especially for +handwritten documents due to large variations in writing styles. While text +line recognition models are generally trained on large corpora of real and +synthetic data, such models can still make frequent mistakes if the handwriting +is inscrutable or the image acquisition process adds corruptions, such as +noise, blur, compression, etc. Writing style is generally quite consistent for +an individual, which can be leveraged to correct mistakes made by such models. +Motivated by this, we introduce the problem of adapting text line recognition +models during test time. We focus on a challenging and realistic setting where, +given only a single test image consisting of multiple text lines, the task is +to adapt the model such that it performs better on the image, without any +labels. We propose an iterative self-training approach that uses feedback from +the language model to update the optical model, with confident self-labels in +each iteration. The confidence measure is based on an augmentation mechanism +that evaluates the divergence of the prediction of the model in a local region. +We perform rigorous evaluation of our method on several benchmark datasets as +well as their corrupted versions. Experimental results on multiple datasets +spanning multiple scripts show that the proposed adaptation method offers an +absolute improvement of up to 8% in character error rate with just a few +iterations of self-training at test time. + +
+
+
+
+
+ + ☆ Pyramid diffractive optical networks for unidirectional magnification + and demagnification + + +
+ Diffractive deep neural networks (D2NNs) are composed of successive +transmissive layers optimized using supervised deep learning to all-optically +implement various computational tasks between an input and output field-of-view +(FOV). Here, we present a pyramid-structured diffractive optical network design +(which we term P-D2NN), optimized specifically for unidirectional image +magnification and demagnification. In this P-D2NN design, the diffractive +layers are pyramidally scaled in alignment with the direction of the image +magnification or demagnification. Our analyses revealed the efficacy of this +P-D2NN design in unidirectional image magnification and demagnification tasks, +producing high-fidelity magnified or demagnified images in only one direction, +while inhibiting the image formation in the opposite direction - confirming the +desired unidirectional imaging operation. Compared to the conventional D2NN +designs with uniform-sized successive diffractive layers, P-D2NN design +achieves similar performance in unidirectional magnification tasks using only +half of the diffractive degrees of freedom within the optical processor volume. +Furthermore, it maintains its unidirectional image +magnification/demagnification functionality across a large band of illumination +wavelengths despite being trained with a single illumination wavelength. With +this pyramidal architecture, we also designed a wavelength-multiplexed +diffractive network, where a unidirectional magnifier and a unidirectional +demagnifier operate simultaneously in opposite directions, at two distinct +illumination wavelengths. The efficacy of the P-D2NN architecture was also +validated experimentally using monochromatic terahertz illumination, +successfully matching our numerical simulations. P-D2NN offers a +physics-inspired strategy for designing task-specific visual processors. + +
+
+ comment: 26 Pages, 7 Figures +
+
+
+
+
+ + ☆ C2G2: Controllable Co-speech Gesture Generation with Latent Diffusion + Model + + +
+ Co-speech gesture generation is crucial for automatic digital avatar
+animation. However, existing methods suffer from issues such as unstable
+training and temporal inconsistency, particularly in generating
+high-fidelity and comprehensive gestures. Additionally, these methods lack
+effective control over speaker identity and temporal editing of the
+generated gestures. Focusing on capturing temporal latent information and
+providing practical control, we propose a Controllable Co-speech Gesture
+Generation framework, named C2G2. Specifically, we propose a two-stage
+temporal dependency enhancement strategy motivated by latent diffusion
+models. We further introduce two key features to C2G2, namely a
+speaker-specific decoder to generate speaker-related real-length skeletons
+and a repainting strategy for flexible gesture generation/editing.
+Extensive experiments on benchmark gesture datasets verify the
+effectiveness of our proposed C2G2 compared with several state-of-the-art
+baselines. The link of the project demo page can be found at
+https://c2g2-gesture.github.io/c2_gesture
+
+
+ comment: 12 pages, 6 figures, 7 tables +
+
+
+
+
+ + ☆ Few-Shot Object Detection via Synthetic Features with Optimal Transport + + +
+ Few-shot object detection aims to simultaneously localize and classify the +objects in an image with limited training samples. However, most existing +few-shot object detection methods focus on extracting the features of a few +samples of novel classes that lack diversity. Hence, they may not be sufficient +to capture the data distribution. To address that limitation, in this paper, we +propose a novel approach in which we train a generator to generate synthetic +data for novel classes. Still, directly training a generator on the novel class +is not effective due to the lack of novel data. To overcome that issue, we +leverage the large-scale dataset of base classes. Our overarching goal is to +train a generator that captures the data variations of the base dataset. We +then transform the captured variations into novel classes by generating +synthetic data with the trained generator. To encourage the generator to +capture data variations on base classes, we propose to train the generator with +an optimal transport loss that minimizes the optimal transport distance between +the distributions of real and synthetic data. Extensive experiments on two +benchmark datasets demonstrate that the proposed method outperforms the state +of the art. Source code will be available. + +
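+ As a rough sketch of the optimal transport objective, the code below computes an entropy-regularized OT (Sinkhorn) distance between batches of real and synthetic features. The feature width, batch sizes, and regularization strength are illustrative assumptions rather than the authors' settings; minimizing this quantity with respect to the generator is one standard way to match the two distributions.

```python
import torch

def sinkhorn_distance(x, y, eps=0.05, iters=50):
    """Entropy-regularized OT distance between two feature batches (uniform weights)."""
    cost = torch.cdist(x, y, p=2) ** 2                  # pairwise squared distances
    a = torch.full((x.size(0),), 1.0 / x.size(0))
    b = torch.full((y.size(0),), 1.0 / y.size(0))
    K = torch.exp(-cost / eps)                          # Gibbs kernel
    u = torch.ones_like(a)
    for _ in range(iters):                              # Sinkhorn fixed-point updates
        u = a / (K @ (b / (K.t() @ u)))
    v = b / (K.t() @ u)
    plan = u.unsqueeze(1) * K * v.unsqueeze(0)          # transport plan
    return (plan * cost).sum()

real = torch.randn(64, 256)                             # base-class features (toy)
fake = torch.randn(64, 256, requires_grad=True)         # generator outputs (toy)
loss = sinkhorn_distance(real, fake)                    # backprop into the generator
```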
+ + ☆ PBFormer: Capturing Complex Scene Text Shape with Polynomial Band + Transformer ACM MM 2023 + + +
+ We present PBFormer, an efficient yet powerful scene text detector that unifies the transformer with a novel text-shape representation, the Polynomial Band (PB). The representation uses four polynomial curves to fit a text's top, bottom, left, and right sides, so it can capture text with complex shapes by varying the polynomial coefficients. PB has appealing features compared with conventional representations: 1) it can model different curvatures with a fixed number of parameters, while polygon-point-based methods need to use varying numbers of points; 2) it can distinguish adjacent or overlapping texts, as they have clearly different curve coefficients, while segmentation-based or point-based methods suffer from adhesive spatial positions. PBFormer combines the PB with the transformer, which can directly generate smooth text contours sampled from predicted curves without interpolation. A parameter-free cross-scale pixel attention (CPA) module is employed to highlight the feature map of a suitable scale while suppressing the other feature maps. This simple operation helps detect small-scale texts and is compatible with the one-stage DETR framework, which requires no NMS post-processing. Furthermore, PBFormer is trained with a shape-contained loss, which not only enforces piecewise alignment between the ground truth and the predicted curves but also keeps the curves' positions and shapes consistent with each other. Without bells and whistles such as text pre-training, our method is superior to the previous state-of-the-art text detectors on arbitrary-shaped text datasets.
+ comment: 9 pages, 8 figures, accepted by ACM MM 2023
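+ The Polynomial Band idea, four polynomials fitting a text instance's top, bottom, left, and right sides, can be illustrated with a toy decoder that samples a closed contour from the four coefficient vectors. The coefficient layout and polynomial degrees below are our assumptions for illustration, not PBFormer's exact parameterization.

```python
import numpy as np

def band_contour(top, bottom, left, right, n=50):
    """Sample a closed contour from four polynomial sides (np.polyval coefficient order).

    top/bottom give y(x) for x in [0, 1]; left/right give x(y) for y in [0, 1].
    """
    x = np.linspace(0.0, 1.0, n)
    y = np.linspace(0.0, 1.0, n)
    top_pts = np.stack([x, np.polyval(top, x)], axis=1)
    right_pts = np.stack([np.polyval(right, y), y], axis=1)
    bottom_pts = np.stack([x[::-1], np.polyval(bottom, x[::-1])], axis=1)
    left_pts = np.stack([np.polyval(left, y[::-1]), y[::-1]], axis=1)
    return np.concatenate([top_pts, right_pts, bottom_pts, left_pts])

# A gently curved toy instance: cubic top/bottom, near-vertical left/right sides.
contour = band_contour(top=[0.3, -0.3, 0.05, 0.10],
                       bottom=[0.3, -0.3, 0.05, 0.40],
                       left=[0.02, 0.0], right=[0.02, 0.98])
```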
+ + ☆ WSAM: Visual Explanations from Style Augmentation as Adversarial + Attacker and Their Influence in Image Classification + + +
+ Style augmentation is currently attracting attention because convolutional neural networks (CNNs) are strongly biased toward recognizing textures rather than shapes. Most existing styling methods either perform a low-fidelity style transfer or produce a weak style representation in the embedding vector. This paper outlines a style augmentation algorithm that uses stochastic sampling with noise addition to improve the randomization of a general linear transformation for style transfer. With our augmentation strategy, models not only exhibit strong robustness to image stylization but also outperform previous methods, surpassing state-of-the-art performance on the STL-10 dataset. In addition, we present an analysis of model interpretations under different style variations and report comprehensive experiments measuring performance when the method is applied to deep neural architectures in various training settings.
+ comment: 8 pages, 10 figures
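+ A minimal sketch of the underlying idea, a linear style transform randomized by sampled noise, is shown below on encoder feature maps. The transform shape, noise scale, and instance-normalization formulation are our assumptions for illustration, not the paper's exact algorithm.

```python
import torch

def random_style_transform(feat, weight, bias, noise_std=0.5):
    """Re-stylize instance-normalized features with a noise-driven linear transform.

    feat:   (B, C, H, W) content features; weight/bias: (C, Z) linear maps
    from a sampled style code to per-channel scales and shifts.
    """
    b, c, _, _ = feat.shape
    z = noise_std * torch.randn(b, weight.size(1))      # stochastic style code
    scale = (z @ weight.t()).view(b, c, 1, 1)
    shift = (z @ bias.t()).view(b, c, 1, 1)
    mean = feat.mean(dim=(2, 3), keepdim=True)
    std = feat.std(dim=(2, 3), keepdim=True) + 1e-5
    return (1 + scale) * (feat - mean) / std + shift    # randomized re-stylization

feat = torch.randn(8, 64, 32, 32)
w, s = torch.randn(64, 16) * 0.1, torch.randn(64, 16) * 0.1
augmented = random_style_transform(feat, w, s)
```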
+ + ☆ ICARUS: An Android-Based Unmanned Aerial Vehicle (UAV) Search and Rescue + Eye in the Sky + + +
+ The purpose of this paper is to develop an unmanned aerial vehicle (UAV) based on a quadcopter with video surveillance, map coordinates, a deployable parachute carrying a medicine kit or a food pack as payload, and a collision warning system, remotely controlled and integrated with an Android application to assist in search and rescue (SAR) operations. We used applied research to develop the functional prototype, and quantitative and descriptive statistics to summarize the data by describing the relationships between variables in a sample or population. The quadcopter was evaluated using a survey instrument to test its acceptability, with predefined variables used to select respondents within Caloocan City and Quezon City, Philippines. Thirty respondents reported demographic profiles and known issues and concerns; the results are summarized in Tables 1 and 2. In terms of demographic profiles, the number of SAR operators within the specified areas is distributed equally; most are male, single, and aged 31 and above. Among issues and concerns, the most common type of search and rescue was ground search and rescue, and human error is the primary cause of most injuries in operating units. All respondents agreed that the prototype was useful and that, in terms of acceptability, drone technology will improve search and rescue operations. The innovative use of Android and drone technology is a new step toward improving SAR operations in the Philippines. The LiPo battery should be replaced with one of higher capacity, and the drone operator should undergo a training course and secure a permit from the Civil Aviation Authority of the Philippines (CAAP).
+ comment: 15 pages, 14 figures, Special Issue: IRCCETE 2023
+ + ☆ Vision Grid Transformer for Document Layout Analysis ICCV2023 + + +
+ Document pre-trained models and grid-based models have proven to be very effective on various tasks in Document AI. However, for the document layout analysis (DLA) task, existing document pre-trained models, even those pre-trained in a multi-modal fashion, usually rely on either textual features or visual features. Grid-based models for DLA are multi-modal but largely neglect the effect of pre-training. To fully leverage multi-modal information and exploit pre-training techniques to learn better representations for DLA, in this paper we present VGT, a two-stream Vision Grid Transformer, in which a Grid Transformer (GiT) is proposed and pre-trained for 2D token-level and segment-level semantic understanding. Furthermore, we curate and release a new dataset named D$^4$LA, which is so far the most diverse and detailed manually annotated benchmark for document layout analysis. Experimental results show that the proposed VGT model achieves new state-of-the-art results on DLA tasks, e.g. PubLayNet ($95.7\% \rightarrow 96.2\%$), DocBank ($79.6\% \rightarrow 84.1\%$), and D$^4$LA ($67.7\% \rightarrow 68.8\%$). The code and models, as well as the D$^4$LA dataset, will be made publicly available at \url{https://github.com/AlibabaResearch/AdvancedLiterateMachinery}.
+ comment: Accepted by ICCV2023
+ + ☆ Reprogramming under constraints: Revisiting efficient and reliable + transferability of lottery tickets + + +
+ In the era of foundation models with huge pre-training budgets, downstream tasks have shifted to the narrative of efficient and fast adaptation. For classification-based tasks in the domain of computer vision, the two most efficient approaches have been linear probing (LP) and visual prompting/reprogramming (VP); the former aims to learn a classifier in the form of a linear head on the features extracted by the pre-trained model, while the latter maps the input data to the domain of the source data on which the model was originally pre-trained. Although extensive studies have demonstrated the differences between LP and VP in terms of downstream performance, we explore the capabilities of the two methods along the sparsity axis: (a) data sparsity, the impact of few-shot adaptation, and (b) model sparsity, the impact of lottery tickets (LTs). We demonstrate that LTs are not universal reprogrammers, i.e., for certain target datasets, reprogramming an LT yields significantly lower performance than reprogramming the dense model, although their corresponding upstream performance is similar. Further, we demonstrate that the calibration of dense models is always superior to that of their lottery-ticket counterparts under both LP and VP regimes. Our empirical study opens a new avenue of research into VP for sparse models and encourages further understanding of the performance beyond the accuracy achieved by VP under constraints of sparsity. Code and logs can be accessed at \url{https://github.com/landskape-ai/Reprogram_LT}.
+ comment: Preprint
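+ For reference, a common minimal form of visual prompting learns only a border pattern added to the input while the backbone (dense or lottery ticket) stays frozen; the pad width and image size below are arbitrary illustrative choices, not the paper's configuration.

```python
import torch
import torch.nn as nn

class VisualPrompt(nn.Module):
    """Learnable border prompt: only the padded frame is trained."""
    def __init__(self, image_size=224, pad=16):
        super().__init__()
        self.prompt = nn.Parameter(torch.zeros(3, image_size, image_size))
        mask = torch.ones(1, image_size, image_size)
        mask[:, pad:-pad, pad:-pad] = 0.0            # keep the interior untouched
        self.register_buffer("mask", mask)

    def forward(self, x):
        return x + self.prompt * self.mask           # frozen backbone sees x + prompt

prompt = VisualPrompt()
x = torch.randn(4, 3, 224, 224)
out = prompt(x)   # feed to a frozen dense or lottery-ticket model
```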
+ + ☆ CEFHRI: A Communication Efficient Federated Learning Framework for + Recognizing Industrial Human-Robot Interaction IROS 2023 + + +
+ Human-robot interaction (HRI) is a rapidly growing field that encompasses social and industrial applications. Machine learning plays a vital role in industrial HRI by enhancing the adaptability and autonomy of robots in complex environments. However, data privacy is a crucial concern in the interaction between humans and robots, as companies need to protect sensitive data while machine learning algorithms require access to large datasets. Federated Learning (FL) offers a solution by enabling the distributed training of models without sharing raw data. Despite extensive research on FL for tasks such as natural language processing (NLP) and image classification, the question of how to use FL for HRI remains an open research problem. The traditional FL approach involves transmitting large neural network parameter matrices between the server and clients, which can lead to high communication costs and often becomes a bottleneck in FL. This paper proposes a communication-efficient FL framework for human-robot interaction (CEFHRI) to address the challenges of data heterogeneity and communication costs. The framework leverages pre-trained models and introduces a trainable spatiotemporal adapter for video understanding tasks in HRI. Experimental results on three human-robot interaction benchmark datasets (HRI30, InHARD, and COIN) demonstrate the superiority of CEFHRI over full fine-tuning in terms of communication costs. The proposed methodology provides a secure and efficient approach to HRI federated learning, particularly in industrial environments with data privacy concerns and limited communication bandwidth. Our code is available at https://github.com/umarkhalidAI/CEFHRI-Efficient-Federated-Learning.
+ comment: Accepted in IROS 2023
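+ The communication saving comes from exchanging only small trainable adapter weights instead of full backbone matrices. Below is a generic bottleneck-adapter sketch with assumed dimensions (the paper's spatiotemporal adapter for video is more elaborate), plus the filtering a client might apply before transmitting updates.

```python
import torch.nn as nn

class Adapter(nn.Module):
    """Bottleneck adapter inserted into a frozen pre-trained block."""
    def __init__(self, dim=768, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.act = nn.GELU()
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x):
        return x + self.up(self.act(self.down(x)))   # residual adaptation

def communicated_state(model):
    """Only adapter tensors are exchanged between FL clients and the server."""
    return {k: v for k, v in model.state_dict().items() if "adapter" in k}
```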
+ + ☆ Read-only Prompt Optimization for Vision-Language Few-shot Learning ICCV2023 + + +
+ In recent years, prompt tuning has proven effective in adapting pre-trained vision-language models to downstream tasks. These methods aim to adapt the pre-trained models by introducing learnable prompts while keeping pre-trained weights frozen. However, learnable prompts can affect the internal representation within the self-attention module, which may increase performance variance and hurt generalization, especially in data-deficient settings. To address these issues, we propose a novel approach, Read-only Prompt Optimization (RPO). RPO leverages masked attention to prevent the internal representation shift in the pre-trained model. Further, to facilitate the optimization of RPO, the read-only prompts are initialized based on special tokens of the pre-trained model. Our extensive experiments demonstrate that RPO outperforms CLIP and CoCoOp in base-to-new generalization and domain generalization while displaying better robustness. The proposed method also achieves better generalization in extremely data-deficient settings, while improving parameter efficiency and reducing computational overhead. Code is available at https://github.com/mlvlab/RPO.
+ comment: Accepted at ICCV2023
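+ One way to realize read-only prompts, assumed here purely for illustration, is an attention mask under which the original tokens never attend to the appended prompt tokens: their internal representations then match the frozen model exactly, while the prompts can still read from everything.

```python
import torch

def read_only_mask(n_tokens, n_prompts):
    """Boolean attention mask (True = blocked) for original + read-only prompt tokens."""
    total = n_tokens + n_prompts
    mask = torch.zeros(total, total, dtype=torch.bool)
    mask[:n_tokens, n_tokens:] = True      # block original -> prompt attention
    return mask

mask = read_only_mask(n_tokens=197, n_prompts=4)   # e.g., ViT CLS+patches, 4 prompts
# inside self-attention: scores.masked_fill_(mask, float("-inf")) before softmax
```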
+ + ☆ Multimodal Foundation Models For Echocardiogram Interpretation + + +
+ Multimodal deep learning foundation models can learn the relationship between images and text. In the context of medical imaging, mapping images to language concepts reflects the clinical task of diagnostic image interpretation; however, current general-purpose foundation models do not perform well in this context because their training corpora contain limited medical text and images. To address this challenge and account for the range of cardiac physiology, we leverage 1,032,975 cardiac ultrasound videos and corresponding expert interpretations to develop EchoCLIP, a multimodal foundation model for echocardiography. EchoCLIP displays strong zero-shot (not explicitly trained) performance in cardiac function assessment (external-validation left ventricular ejection fraction mean absolute error (MAE) of 7.1%) and identification of implanted intracardiac devices (areas under the curve (AUC) between 0.84 and 0.98 for pacemakers and artificial heart valves). We also developed a long-context variant (EchoCLIP-R) with a custom echocardiography report text tokenizer, which can accurately identify unique patients across multiple videos (AUC of 0.86), identify clinical changes such as orthotopic heart transplants (AUC of 0.79) or cardiac surgery (AUC of 0.77), and enable robust image-to-text search (mean cross-modal retrieval rank in the top 1% of candidate text reports). These emergent capabilities can be used for preliminary assessment and summarization of echocardiographic findings.
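+ Zero-shot use of a CLIP-style model reduces to cosine similarity between image and text embeddings. The sketch below is generic and does not reflect EchoCLIP's actual API; the embedding width and candidate report phrases are placeholders.

```python
import torch
import torch.nn.functional as F

def zero_shot_scores(image_emb, text_embs):
    """Cosine-similarity scores between one image embedding and candidate texts."""
    image_emb = F.normalize(image_emb, dim=-1)
    text_embs = F.normalize(text_embs, dim=-1)
    return image_emb @ text_embs.t()               # higher = better text match

# Toy stand-ins for encoder outputs (the real encoders embed echo videos/reports).
image_emb = torch.randn(1, 512)
text_embs = torch.randn(3, 512)                    # e.g., candidate EF-range phrases
probs = zero_shot_scores(image_emb, text_embs).softmax(dim=-1)
```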
+ + ☆ Bridging Distribution Learning and Image Clustering in High-dimensional + Space + + +
+ Distribution learning focuses on learning the probability density function from a set of data samples, whereas clustering aims to group similar objects together in an unsupervised manner. Usually, these two tasks are considered unrelated; however, they may be indirectly related, with Gaussian Mixture Models (GMMs) acting as a bridge. In this paper, we explore the correlation between distribution learning and clustering, with the motivation of filling the gap between these two fields. We use an autoencoder (AE) to encode images into a high-dimensional latent space; then, Monte-Carlo Marginalization (MCMarg) and a Kullback-Leibler (KL) divergence loss are used to fit the Gaussian components of the GMM and learn the data distribution. Finally, image clustering is achieved through each Gaussian component of the GMM. The "curse of dimensionality" poses severe challenges for most clustering algorithms; compared with the classic Expectation-Maximization (EM) algorithm, experimental results show that MCMarg and the KL divergence loss can greatly alleviate this difficulty. Based on these results, we believe distribution learning can exploit the potential of GMMs for image clustering in high-dimensional space.
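+ Once latents and a fitted GMM are available, clustering is an argmax over component responsibilities. The sketch below uses scikit-learn's EM fitting as a stand-in for the paper's MCMarg + KL divergence procedure; the latent width and component count are illustrative.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

# Stand-in for autoencoder latents; in the paper these come from an AE encoder.
latents = np.random.randn(1000, 64)

# Fit a GMM to the latent distribution (the paper uses MCMarg + KL instead of EM).
gmm = GaussianMixture(n_components=10, covariance_type="diag", random_state=0)
gmm.fit(latents)

# Each Gaussian component acts as one image cluster.
cluster_ids = gmm.predict(latents)              # argmax posterior responsibility
soft_assignments = gmm.predict_proba(latents)   # shape (N, 10)
```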
+ + ☆ Unveiling Camouflage: A Learnable Fourier-based Augmentation for + Camouflaged Object Detection and Instance Segmentation + + +
+ Camouflaged object detection (COD) and camouflaged instance segmentation (CIS) aim to recognize and segment objects that are blended into their surroundings. While several deep neural network models have been proposed to tackle these tasks, augmentation methods for COD and CIS have not been thoroughly explored. Augmentation strategies can improve model performance by increasing the size and diversity of the training data and exposing the model to a wider range of variations. Beyond this, we aim to automatically learn transformations that reveal the underlying structure of camouflaged objects and allow the model to better identify and segment them. To achieve this, we propose a learnable augmentation method in the frequency domain for COD and CIS via a Fourier transform approach, dubbed CamoFourier. Our method leverages a conditional generative adversarial network and a cross-attention mechanism to generate a reference image, and uses an adaptive hybrid swapping with learned parameters to mix the low-frequency component of the reference image with the high-frequency component of the input image. This approach makes camouflaged objects more visible to detection and segmentation models. Without bells and whistles, our proposed augmentation method boosts the performance of camouflaged object detectors and camouflaged instance segmenters by large margins.
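+ The frequency-mixing step can be sketched with a plain FFT swap that takes the reference image's low frequencies and the input's high frequencies. The fixed square cutoff and its size alpha below are simplifications we assume in place of the paper's learned adaptive hybrid swapping.

```python
import numpy as np

def frequency_swap(input_img, reference_img, alpha=0.15):
    """Mix reference low frequencies with input high frequencies (H, W, C in [0, 1])."""
    h, w = input_img.shape[:2]
    fin = np.fft.fftshift(np.fft.fft2(input_img, axes=(0, 1)), axes=(0, 1))
    fref = np.fft.fftshift(np.fft.fft2(reference_img, axes=(0, 1)), axes=(0, 1))
    ch, cw, dh, dw = h // 2, w // 2, int(alpha * h), int(alpha * w)
    fin[ch - dh:ch + dh, cw - dw:cw + dw] = fref[ch - dh:ch + dh, cw - dw:cw + dw]
    out = np.fft.ifft2(np.fft.ifftshift(fin, axes=(0, 1)), axes=(0, 1))
    return np.real(out).clip(0.0, 1.0)

augmented = frequency_swap(np.random.rand(256, 256, 3), np.random.rand(256, 256, 3))
```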
+ + ☆ Detection of Mild Cognitive Impairment Using Facial Features in Video + Conversations + + +
+ Early detection of Mild Cognitive Impairment (MCI) enables early interventions that can slow the progression from MCI to dementia. Deep learning (DL) algorithms could help achieve early, non-invasive, low-cost detection of MCI. This paper presents the detection of MCI in older adults using DL models based only on facial features extracted from video-recorded conversations at home. We used data collected from the I-CONECT behavioral intervention study (NCT02871921), in which several sessions of semi-structured interviews between socially isolated older individuals and interviewers were video recorded. We developed a framework that extracts spatially holistic facial features using a convolutional autoencoder and temporal information using transformers. Our proposed DL model was able to detect the I-CONECT study participants' cognitive conditions (MCI versus normal cognition (NC)) using facial features. The segment and sequence information of the facial features improved prediction performance compared with non-temporal features: detection accuracy reached 88% with the combined method, compared with 84% without the segment and sequence information of the facial features within a video on a given theme.
+ + ☆ RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware + Contextual Reasoning on Whole Slide Images AAAI + + +
+ Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer +in the US. It is diagnosed by manual multi-class tumor grading using a tissue +whole slide image (WSI), which is subjective and suffers from inter-pathologist +variability. We propose an automated weakly-supervised grading approach for +cSCC WSIs that is trained using WSI-level grade and does not require +fine-grained tumor annotations. The proposed model, RACR-MIL, transforms each +WSI into a bag of tiled patches and leverages attention-based multiple-instance +learning to assign a WSI-level grade. We propose three key innovations to +address general as well as cSCC-specific challenges in tumor grading. First, we +leverage spatial and semantic proximity to define a WSI graph that encodes both +local and non-local dependencies between tumor regions and leverage graph +attention convolution to derive contextual patch features. Second, we introduce +a novel ordinal ranking constraint on the patch attention network to ensure +that higher-grade tumor regions are assigned higher attention. Third, we use +tumor depth as an auxiliary task to improve grade classification in a multitask +learning framework. RACR-MIL achieves 2-9% improvement in grade classification +over existing weakly-supervised approaches on a dataset of 718 cSCC tissue +images and localizes the tumor better. The model achieves 5-20% higher accuracy +in difficult-to-classify high-risk grade classes and is robust to class +imbalance. + +
+ comment: 7 pages main text, 2 page references, 3 page appendix; submitted to AAAI
+ + ☆ Prototype Fission: Closing Set for Robust Open-set Semi-supervised + Learning + + +
+ Semi-supervised Learning (SSL) has been proven vulnerable to out-of-distribution (OOD) samples in realistic large-scale unsupervised datasets due to over-confidently pseudo-labeling OODs as in-distribution (ID). A key underlying problem is class-wise latent-space spreading from the closed seen space to the open unseen space, and this bias is further magnified in SSL's self-training loops. To close the ID distribution set so that OODs are better rejected for safe SSL, we propose Prototype Fission (PF), which divides class-wise latent spaces into compact sub-spaces by automatic fine-grained latent-space mining, driven by coarse-grained labels only. Specifically, we form multiple unique learnable sub-class prototypes for each class, optimized towards both diversity and consistency: the Diversity Modeling term encourages samples to be clustered by one of the multiple sub-class prototypes, while the Consistency Modeling term clusters all samples of the same class to a global prototype. Instead of "opening the set", i.e., modeling the OOD distribution, Prototype Fission "closes the set" and makes it hard for OOD samples to fit into the sub-class latent spaces. PF is therefore compatible with existing methods for further performance gains. Extensive experiments validate the effectiveness of our method in open-set SSL settings in terms of successfully forming sub-classes, discriminating OODs from IDs, and improving overall accuracy. Code will be released.
+ + ☆ Learning Sequential Information in Task-based fMRI for Synthetic Data + Augmentation MICCAI + + +
+ Insufficiency of training data is a persistent issue in medical image +analysis, especially for task-based functional magnetic resonance images (fMRI) +with spatio-temporal imaging data acquired using specific cognitive tasks. In +this paper, we propose an approach for generating synthetic fMRI sequences that +can then be used to create augmented training datasets in downstream learning +tasks. To synthesize high-resolution task-specific fMRI, we adapt the +$\alpha$-GAN structure, leveraging advantages of both GAN and variational +autoencoder models, and propose different alternatives in aggregating temporal +information. The synthetic images are evaluated from multiple perspectives +including visualizations and an autism spectrum disorder (ASD) classification +task. The results show that the synthetic task-based fMRI can provide effective +data augmentation in learning the ASD classification task. + +
+ comment: Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI workshop), preprint version
+ + ☆ A Pseudo-Boolean Polynomials Approach for Image Edge Detection + + +
+ We introduce a novel approach for image edge detection based on +pseudo-Boolean polynomials for image patches. We show that patches covering +edge regions in the image result in pseudo-Boolean polynomials with higher +degrees compared to patches that cover blob regions. The proposed approach is +based on reduction of polynomial degree and equivalence properties of +penalty-based pseudo-Boolean polynomials. + +
+ comment: 8 pages, 7 figures
+ + ☆ Efficient Ray Sampling for Radiance Fields Reconstruction + + +
+ Accelerating neural radiance fields training is of substantial practical +value, as the ray sampling strategy profoundly impacts network convergence. +More efficient ray sampling can thus directly enhance existing NeRF models' +training efficiency. We therefore propose a novel ray sampling approach for +neural radiance fields that improves training efficiency while retaining +photorealistic rendering results. First, we analyze the relationship between +the pixel loss distribution of sampled rays and rendering quality. This reveals +redundancy in the original NeRF's uniform ray sampling. Guided by this finding, +we develop a sampling method leveraging pixel regions and depth boundaries. Our +main idea is to sample fewer rays in training views, yet with each ray more +informative for scene fitting. Sampling probability increases in pixel areas +exhibiting significant color and depth variation, greatly reducing wasteful +rays from other regions without sacrificing precision. Through this method, not +only can the convergence of the network be accelerated, but the spatial +geometry of a scene can also be perceived more accurately. Rendering outputs +are enhanced, especially for texture-complex regions. Experiments demonstrate +that our method significantly outperforms state-of-the-art techniques on public +benchmark datasets. + +
+ comment: 15 pages
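+ The core sampling idea, drawing more rays where the recent per-pixel loss is high, can be sketched as below. This toy version ignores the paper's additional pixel-region and depth-boundary cues, and the uniform mixing floor is our assumption to keep low-loss regions from being starved entirely.

```python
import numpy as np

def sample_rays(per_pixel_loss, n_rays, floor=0.1):
    """Sample ray indices with probability proportional to recent per-pixel loss."""
    p = per_pixel_loss / per_pixel_loss.sum()
    p = (1.0 - floor) * p + floor / p.size          # mix with uniform sampling
    return np.random.choice(p.size, size=n_rays, replace=False, p=p)

losses = np.random.rand(400 * 400) ** 2             # toy loss map for one training view
ray_ids = sample_rays(losses, n_rays=1024)          # rays to render this iteration
```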
+ + ☆ DebSDF: Delving into the Details and Bias of Neural Indoor Scene + Reconstruction + + +
+ In recent years, the neural implicit surface has emerged as a powerful +representation for multi-view surface reconstruction due to its simplicity and +state-of-the-art performance. However, reconstructing smooth and detailed +surfaces in indoor scenes from multi-view images presents unique challenges. +Indoor scenes typically contain large texture-less regions, making the +photometric loss unreliable for optimizing the implicit surface. Previous work +utilizes monocular geometry priors to improve the reconstruction in indoor +scenes. However, monocular priors often contain substantial errors in thin +structure regions due to domain gaps and the inherent inconsistencies when +derived independently from different views. This paper presents \textbf{DebSDF} +to address these challenges, focusing on the utilization of uncertainty in +monocular priors and the bias in SDF-based volume rendering. We propose an +uncertainty modeling technique that associates larger uncertainties with larger +errors in the monocular priors. High-uncertainty priors are then excluded from +optimization to prevent bias. This uncertainty measure also informs an +importance-guided ray sampling and adaptive smoothness regularization, +enhancing the learning of fine structures. We further introduce a bias-aware +signed distance function to density transformation that takes into account the +curvature and the angle between the view direction and the SDF normals to +reconstruct fine details better. Our approach has been validated through +extensive experiments on several challenging datasets, demonstrating improved +qualitative and quantitative results in reconstructing thin structures in +indoor scenes, thereby outperforming previous work. + +
+ + ☆ Document AI: A Comparative Study of Transformer-Based, Graph-Based + Models, and Convolutional Neural Networks For Document Layout Analysis + + +
+ Document AI aims to automatically analyze documents by leveraging natural language processing and computer vision techniques. One of the major tasks of Document AI is document layout analysis, which structures document pages by interpreting the content and spatial relationships of layout, image, and text. This task can be image-centric, where the aim is to identify and label various regions such as authors and paragraphs, or text-centric, where the focus is on classifying individual words in a document. Although there are increasingly sophisticated methods for improving layout analysis, doubts remain about the extent to which their findings generalize to a broader context. Specifically, prior work developed systems based on very different architectures, such as transformer-based, graph-based, and CNN models, but no prior work has directly compared the effectiveness of these architectures. Moreover, while language-independent Document AI models capable of knowledge transfer have been developed, it remains to be investigated to what degree they can effectively transfer knowledge. In this study, we aim to fill these gaps by conducting a comparative evaluation of state-of-the-art models in document layout analysis and investigating the potential of cross-lingual layout analysis by utilizing machine translation techniques.
+ + ☆ Shatter and Gather: Learning Referring Image Segmentation with Text + Supervision ICCV 2023 + + +
+ Referring image segmentation, the task of segmenting any arbitrary entities +described in free-form texts, opens up a variety of vision applications. +However, manual labeling of training data for this task is prohibitively +costly, leading to lack of labeled data for training. We address this issue by +a weakly supervised learning approach using text descriptions of training +images as the only source of supervision. To this end, we first present a new +model that discovers semantic entities in input image and then combines such +entities relevant to text query to predict the mask of the referent. We also +present a new loss function that allows the model to be trained without any +further supervision. Our method was evaluated on four public benchmarks for +referring image segmentation, where it clearly outperformed the existing method +for the same task and recent open-vocabulary segmentation models on all the +benchmarks. + +
+ comment: Accepted to ICCV 2023
+ + ♻ ☆ Fairness-aware Vision Transformer via Debiased Self-Attention + + +
+ The Vision Transformer (ViT) has recently gained significant interest in solving computer vision (CV) problems due to its capability of extracting informative features and modeling long-range dependencies through the self-attention mechanism. To fully realize the advantages of ViT in real-world applications, recent works have explored the trustworthiness of ViT, including its robustness and explainability. However, another desideratum, fairness, has not yet been adequately addressed in the literature. We establish that existing fairness-aware algorithms (primarily designed for CNNs) do not perform well on ViT, which motivates our novel framework, Debiased Self-Attention (DSA). DSA is a fairness-through-blindness approach that enforces ViT to eliminate spurious features correlated with sensitive attributes for bias mitigation. Notably, adversarial examples are leveraged to locate and mask the spurious features in the input image patches. In addition, DSA utilizes an attention-weight alignment regularizer in the training objective to encourage the learning of informative features for target prediction. Importantly, our DSA framework leads to improved fairness guarantees over prior works on multiple prediction tasks without compromising target prediction performance.
+ + ♻ ☆ An Empirical Investigation of the Role of Pre-training in Lifelong + Learning + + +
+ The lifelong learning paradigm in machine learning is an attractive +alternative to the more prominent isolated learning scheme not only due to its +resemblance to biological learning but also its potential to reduce energy +waste by obviating excessive model re-training. A key challenge to this +paradigm is the phenomenon of catastrophic forgetting. With the increasing +popularity and success of pre-trained models in machine learning, we pose the +question: What role does pre-training play in lifelong learning, specifically +with respect to catastrophic forgetting? We investigate existing methods in the +context of large, pre-trained models and evaluate their performance on a +variety of text and image classification tasks, including a large-scale study +using a novel data set of 15 diverse NLP tasks. Across all settings, we observe +that generic pre-training implicitly alleviates the effects of catastrophic +forgetting when learning multiple tasks sequentially compared to randomly +initialized models. We then further investigate why pre-training alleviates +forgetting in this setting. We study this phenomenon by analyzing the loss +landscape, finding that pre-trained weights appear to ease forgetting by +leading to wider minima. Based on this insight, we propose jointly optimizing +for current task loss and loss basin sharpness to explicitly encourage wider +basins during sequential fine-tuning. We show that this optimization approach +outperforms several state-of-the-art task-sequential continual learning +algorithms across multiple settings, occasionally even without retaining a +memory that scales in size with the number of tasks. + +
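+ One standard way to penalize basin sharpness during sequential fine-tuning is a SAM-style two-step update: ascend to the locally sharpest nearby weights, then descend using the gradient measured there. The sketch below illustrates that generic recipe under our own assumptions; it is not necessarily the authors' exact objective.

```python
import torch

def sharpness_aware_step(model, loss_fn, batch, optimizer, rho=0.05):
    """One update that favors wide minima (SAM-style, illustrative)."""
    loss_fn(model, batch).backward()
    params = [p for p in model.parameters() if p.grad is not None]
    grads = [p.grad.detach().clone() for p in params]
    norm = torch.norm(torch.stack([g.norm() for g in grads])) + 1e-12
    with torch.no_grad():
        for p, g in zip(params, grads):
            p.add_(g * rho / norm)                  # step toward the sharpest point
    optimizer.zero_grad()
    loss_fn(model, batch).backward()                # gradient at the perturbed weights
    with torch.no_grad():
        for p, g in zip(params, grads):
            p.sub_(g * rho / norm)                  # restore the original weights
    optimizer.step()                                # descend with the sharpness-aware grad
```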
+ + ♻ ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking closer to real-world applications, both in the nature of data tested ($X$: multi-channel, multi-paged, multi-industry; $Y$: class distributions and label set variety) and in classification tasks considered ($f$: multi-page document, page stream, and document bundle classification, ...). We identify the lack of public multi-page document classification datasets, formalize different classification tasks arising in application scenarios, and motivate the value of targeting efficient multi-page document representations. An experimental study on proposed multi-page document classification datasets demonstrates that current benchmarks have become irrelevant and need to be updated to evaluate complete documents, as they naturally occur in practice. This reality check also calls for more mature evaluation methodologies, covering calibration evaluation, inference complexity (time-memory), and a range of realistic distribution shifts (e.g., born-digital vs. scanning noise, shifting page order). Our study ends on a hopeful note by recommending concrete avenues for future improvements.
+ comment: 8 pages, under review
+ + ♻ ☆ Learning Content-enhanced Mask Transformer for Domain Generalized + Urban-Scene Segmentation + + +
+ Domain-generalized urban-scene semantic segmentation (USSS) aims to learn +generalized semantic predictions across diverse urban-scene styles. Unlike +domain gap challenges, USSS is unique in that the semantic categories are often +similar in different urban scenes, while the styles can vary significantly due +to changes in urban landscapes, weather conditions, lighting, and other +factors. Existing approaches typically rely on convolutional neural networks +(CNNs) to learn the content of urban scenes. + In this paper, we propose a Content-enhanced Mask TransFormer (CMFormer) for +domain-generalized USSS. The main idea is to enhance the focus of the +fundamental component, the mask attention mechanism, in Transformer +segmentation models on content information. To achieve this, we introduce a +novel content-enhanced mask attention mechanism. It learns mask queries from +both the image feature and its down-sampled counterpart, as lower-resolution +image features usually contain more robust content information and are less +sensitive to style variations. These features are fused into a Transformer +decoder and integrated into a multi-resolution content-enhanced mask attention +learning scheme. + Extensive experiments conducted on various domain-generalized urban-scene +segmentation datasets demonstrate that the proposed CMFormer significantly +outperforms existing CNN-based methods for domain-generalized semantic +segmentation, achieving improvements of up to 14.00\% in terms of mIoU (mean +intersection over union). The source code for CMFormer will be made available +at this +\href{https://github.com/BiQiWHU/domain-generalized-urban-scene-segmentation}{repository}. + +
+ comment: 18 pages, 10 figures
+ + ♻ ☆ Exploring the Relationship between Samples and Masks for Robust Defect + Localization + + +
+ Defect detection aims to detect and localize regions outside the normal distribution. Previous approaches model normality and compare it with the input to identify defective regions, which can limit their generalizability. This paper proposes a one-stage framework that detects defective patterns directly, without the modeling process. This ability is achieved through the joint efforts of three parties: a generative adversarial network (GAN), a newly proposed scaled pattern loss, and a dynamic masked cycle-consistent auxiliary network. Explicit information that could indicate the position of defects is intentionally excluded to avoid learning any direct mapping. Experimental results on the texture class of the challenging MVTec AD dataset show that the proposed method is 2.9\% higher than the SOTA methods in F1-Score, while substantially outperforming SOTA methods in generalizability.
+ ♻ ☆ Few-shot $\mathbf{1/a}$ Anomalies Feedback: Damage Vision Mining Opportunity and Embedding Feature Imbalance
+ Over the past decade, balanced datasets have been used to advance deep learning algorithms for industrial applications. In urban infrastructures and living environments, however, damage data mining cannot avoid imbalanced-data issues, because of rare unseen events and the improved, high-quality status of operations. For visual inspection, the deteriorated classes acquired from the surfaces of concrete and steel components are occasionally imbalanced. From numerous related surveys, we conclude that imbalanced data problems can be categorised into four types: 1) a missing range of target and label variables, 2) majority-minority class imbalance, 3) foreground-background spatial imbalance, and 4) long-tailed pixel-wise class imbalance. Since 2015, many studies of imbalanced data have been conducted using deep-learning approaches, including regression, image classification, object detection, and semantic segmentation; however, anomaly detection for imbalanced data is not well known. In this study, we highlight a one-class anomaly detection application (anomalous class or not) and demonstrate clear examples of imbalanced vision datasets: medical disease, hazardous behaviour, material deterioration, plant disease, river sludge, and disaster damage. We provide key results on the advantage of damage-vision mining, hypothesising that the more effective the range of the positive ratio, the higher the accuracy gain of the anomaly feedback. In our imbalanced studies, compared with the balanced case with a positive ratio of $1/1$, we find that there is an applicable positive ratio $1/a$ for which the accuracy is consistently high. However, in the extremely imbalanced range, from one shot to $1/2a$, accuracy is inferior to that of the applicable ratio. In contrast, with a positive ratio over $2/a$, training shifts into an over-mining phase without an effective gain in accuracy.
+ comment: 34 pages, 53 figures, 28 tables
+ + ♻ ☆ Reliable Multimodality Eye Disease Screening via Mixture of Student's t + Distributions MICCAI 2023 + + +
+ Multimodality eye disease screening is crucial in ophthalmology, as it integrates information from diverse sources to complement their respective performances. However, existing methods are weak in assessing the reliability of each unimodality, and directly fusing an unreliable modality may cause screening errors. To address this issue, we introduce a novel multimodality evidential fusion pipeline for eye disease screening, EyeMoSt, which provides a measure of confidence for each unimodality and elegantly integrates the multimodality information from a multi-distribution fusion perspective. Specifically, our model estimates both local uncertainty for each unimodality and global uncertainty for the fusion modality to produce reliable classification results. More importantly, the proposed mixture of Student's $t$ distributions adaptively integrates different modalities to endow the model with heavy-tailed properties, increasing robustness and reliability. Our experimental findings on both public and in-house datasets show that our model is more reliable than current methods. Additionally, EyeMoSt has the potential to serve as a data-quality discriminator, enabling reliable decision-making for multimodality eye disease screening.
+ comment: MICCAI 2023 (Early accept): 11 pages, 4 figures
+ + ♻ ☆ Uncertainty-inspired Open Set Learning for Retinal Anomaly + Identification + + +
+ Failure to recognize samples from the classes unseen during training is a +major limitation of artificial intelligence in the real-world implementation +for recognition and classification of retinal anomalies. We established an +uncertainty-inspired open-set (UIOS) model, which was trained with fundus +images of 9 retinal conditions. Besides assessing the probability of each +category, UIOS also calculated an uncertainty score to express its confidence. +Our UIOS model with thresholding strategy achieved an F1 score of 99.55%, +97.01% and 91.91% for the internal testing set, external target categories +(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1 +score of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS +correctly predicted high uncertainty scores, which would prompt the need for a +manual check in the datasets of non-target categories retinal diseases, +low-quality fundus images, and non-fundus images. UIOS provides a robust method +for real-world screening of retinal anomalies. + +
+ + ♻ ☆ TALL: Thumbnail Layout for Deepfake Video Detection ICCV 2023 + + +
+ The growing threats of deepfakes to society and cybersecurity have raised enormous public concern, and increasing efforts have been devoted to the critical topic of deepfake video detection. Existing video methods achieve good performance but are computationally intensive. This paper introduces a simple yet effective strategy named Thumbnail Layout (TALL), which transforms a video clip into a pre-defined layout that preserves spatial and temporal dependencies. Specifically, consecutive frames are masked at a fixed position in each frame to improve generalization, then resized to sub-images and rearranged into a pre-defined layout as the thumbnail. TALL is model-agnostic and extremely simple, requiring only a few lines of code to be modified. Inspired by the success of vision transformers, we incorporate TALL into the Swin Transformer, forming an efficient and effective method, TALL-Swin. Extensive intra-dataset and cross-dataset experiments validate the effectiveness and superiority of TALL and the SOTA TALL-Swin. TALL-Swin achieves 90.79$\%$ AUC on the challenging cross-dataset task FaceForensics++ $\to$ Celeb-DF. The code is available at https://github.com/rainy-xu/TALL4Deepfake.
+ comment: Accepted by ICCV 2023
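+ The layout transform itself is only a resize-and-tile operation, as in the sketch below; the grid shape and thumbnail size are illustrative, and the paper's fixed-position frame masking is omitted for brevity.

```python
import torch
import torch.nn.functional as F

def thumbnail_layout(clip, grid=(2, 2), size=112):
    """Tile T consecutive frames into one thumbnail, preserving temporal order.

    clip: (T, C, H, W) with T == grid[0] * grid[1].
    Returns (C, grid[0] * size, grid[1] * size).
    """
    t, c, _, _ = clip.shape
    rows, cols = grid
    assert t == rows * cols
    frames = F.interpolate(clip, size=(size, size), mode="bilinear", align_corners=False)
    frames = frames.view(rows, cols, c, size, size)
    return frames.permute(2, 0, 3, 1, 4).reshape(c, rows * size, cols * size)

thumbnail = thumbnail_layout(torch.randn(4, 3, 224, 224))   # feed to an image backbone
```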
+ + ♻ ☆ Weakly Supervised Intracranial Hemorrhage Segmentation using Head-Wise + Gradient-Infused Self-Attention Maps from a Swin Transformer in Categorical + Learning + + +
+ Intracranial hemorrhage (ICH) is a life-threatening medical emergency that +requires timely and accurate diagnosis for effective treatment and improved +patient survival rates. While deep learning techniques have emerged as the +leading approach for medical image analysis and processing, the most commonly +employed supervised learning often requires large, high-quality annotated +datasets that can be costly to obtain, particularly for pixel/voxel-wise image +segmentation. To address this challenge and facilitate ICH treatment decisions, +we introduce a novel weakly supervised method for ICH segmentation, utilizing a +Swin transformer trained on an ICH classification task with categorical labels. +Our approach leverages a hierarchical combination of head-wise gradient-infused +self-attention maps to generate accurate image segmentation. Additionally, we +conducted an exploratory study on different learning strategies and showed that +binary ICH classification has a more positive impact on self-attention maps +compared to full ICH subtyping. With a mean Dice score of 0.44, our technique +achieved similar ICH segmentation performance as the popular U-Net and +Swin-UNETR models with full supervision and outperformed a similar weakly +supervised approach using GradCAM, demonstrating the excellent potential of the +proposed framework in challenging medical image segmentation tasks. Our code is +available at https://github.com/HealthX-Lab/HGI-SAM. + +
+ comment: Accepted for publication at the Journal of Machine Learning for Biomedical Imaging (MELBA) https://melba-journal.org/2023:012
+ ♻ ☆ TeViS: Translating Text Synopses to Video Storyboards
+ A video storyboard is a roadmap for video creation consisting of shot-by-shot images that visualize the key plots in a text synopsis. Creating video storyboards, however, remains challenging, as it not only requires cross-modal association between high-level texts and images but also demands long-term reasoning to make transitions smooth across shots. In this paper, we propose a new task called Text synopsis to Video Storyboard (TeViS), which aims to retrieve an ordered sequence of images as the video storyboard to visualize the text synopsis. We construct a MovieNet-TeViS dataset based on the public MovieNet dataset. It contains 10K text synopses, each paired with keyframes manually selected from the corresponding movies by considering both relevance and cinematic coherence. To benchmark the task, we present strong CLIP-based baselines and a novel VQ-Trans. VQ-Trans first encodes the text synopsis and images into a joint embedding space and uses vector quantization (VQ) to improve the visual representation. Then, it auto-regressively generates a sequence of visual features for retrieval and ordering. Experimental results demonstrate that VQ-Trans significantly outperforms prior methods and the CLIP-based baselines. Nevertheless, there is still a large gap compared to human performance, suggesting room for promising future work. The code and data are available at \url{https://ruc-aimind.github.io/projects/TeViS/}
+ comment: Accepted to ACM Multimedia 2023
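+ The vector-quantization step inside a model like VQ-Trans can be sketched as a nearest-codebook lookup with a straight-through gradient; the codebook size and embedding width below are illustrative assumptions.

```python
import torch

def vector_quantize(z, codebook):
    """Snap each latent to its nearest codebook entry (straight-through gradient)."""
    dists = torch.cdist(z, codebook)       # (N, K) Euclidean distances
    idx = dists.argmin(dim=1)
    zq = codebook[idx]
    zq = z + (zq - z).detach()             # gradients flow back to the encoder
    return zq, idx

codebook = torch.randn(512, 256)           # K codes of width 256 (toy sizes)
z = torch.randn(10, 256)                   # encoder outputs for 10 keyframe slots
zq, idx = vector_quantize(z, codebook)
```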
+ + ♻ ☆ Atlas-Based Interpretable Age Prediction In Whole-Body MR Images + + +
+ Age prediction is an important part of medical assessments and research. It can aid in detecting diseases as well as abnormal ageing by highlighting discrepancies between chronological and biological age. To gain a comprehensive understanding of age-related changes observed in various body parts, we investigate them at a larger scale using whole-body images. We utilise the Grad-CAM interpretability method to determine the body areas most predictive of a person's age, and we expand our analysis beyond individual subjects by employing registration techniques to generate population-wide interpretability maps. Furthermore, we set a new state of the art in whole-body age prediction with a model that achieves a mean absolute error of 2.76 years. Our findings reveal three primary areas of interest: the spine, the autochthonous back muscles, and the cardiac region, with the latter exhibiting the highest importance.
+ + ♻ ☆ Unified and Dynamic Graph for Temporal Character Grouping in Long Videos + + +
+ Video temporal character grouping locates the appearing moments of major characters within a video according to their identities. To this end, recent works have evolved from unsupervised clustering to graph-based supervised clustering. However, graph methods are built upon the premise of a fixed affinity graph, which introduces many inexact connections; besides, they extract multi-modal features with several separate models, which is unfriendly to deployment. In this paper, we present a unified and dynamic graph (UniDG) framework for temporal character grouping. This is accomplished, firstly, by a unified representation network that learns representations of multiple modalities within the same space while still preserving each modality's uniqueness. Secondly, we present dynamic graph clustering, where neighbors of varying quantities are dynamically constructed for each node via a cyclic matching strategy, leading to a more reliable affinity graph. Thirdly, a progressive association method is introduced to exploit spatial and temporal contexts among different modalities, allowing multi-modal clustering results to be fused well. As current datasets only provide pre-extracted features, we evaluate our UniDG method on a collected dataset named MTCG, which contains each character's appearing clips of face and body and speaking voice tracks. We also evaluate our key components on existing clustering and retrieval datasets to verify their generalization ability. Experimental results show that our method achieves promising results and outperforms several state-of-the-art approaches.
+ + ♻ ☆ A Unified Query-based Paradigm for Camouflaged Instance Segmentation ACM MM2023 + + +
+ Due to the high similarity between camouflaged instances and the background, +the recently proposed camouflaged instance segmentation (CIS) faces challenges +in accurate localization and instance segmentation. To this end, inspired by +query-based transformers, we propose a unified query-based multi-task learning +framework for camouflaged instance segmentation, termed UQFormer, which builds +a set of mask queries and a set of boundary queries to learn a shared composed +query representation and efficiently integrates global camouflaged object +region and boundary cues, for simultaneous instance segmentation and instance +boundary detection in camouflaged scenarios. Specifically, we design a composed +query learning paradigm that learns a shared representation to capture object +region and boundary features by the cross-attention interaction of mask queries +and boundary queries in the designed multi-scale unified learning transformer +decoder. Then, we present a transformer-based multi-task learning framework for +simultaneous camouflaged instance segmentation and camouflaged instance +boundary detection based on the learned composed query representation, which +also forces the model to learn a strong instance-level query representation. +Notably, our model views the instance segmentation as a query-based direct set +prediction problem, without other post-processing such as non-maximal +suppression. Compared with 14 state-of-the-art approaches, our UQFormer +significantly improves the performance of camouflaged instance segmentation. +Our code will be available at https://github.com/dongbo811/UQFormer. + +
+ comment: This paper has been accepted by ACM MM2023
+ + ♻ ☆ Fast Neural Scene Flow + + +
+ Neural Scene Flow Prior (NSFP) is of significant interest to the vision community due to its inherent robustness to out-of-distribution (OOD) effects and its ability to deal with dense lidar points. The approach utilizes a coordinate neural network to estimate scene flow at runtime, without any training. However, it is up to 100 times slower than current state-of-the-art learning methods. In other applications, such as image, video, and radiance-function reconstruction, innovations in speeding up the runtime performance of coordinate networks have centered upon architectural changes. In this paper, we demonstrate that scene flow is different: the dominant computational bottleneck stems from the loss function itself (i.e., the Chamfer distance). Further, we rediscover the distance transform (DT) as an efficient, correspondence-free loss function that dramatically speeds up runtime optimization. Our fast neural scene flow (FNSF) approach reports, for the first time, real-time performance comparable to learning methods, without any training or OOD bias, on two of the largest open autonomous driving (AV) lidar datasets, Waymo Open and Argoverse.
+ comment: 17 pages, 11 figures, 6 tables
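+ The distance transform (DT) loss can be sketched as a precomputed voxel grid of distances to the target cloud, so each warped source point costs a single lookup instead of a nearest-neighbor search. The grid resolution and nearest-cell lookup (instead of trilinear interpolation) are simplifications assumed for illustration.

```python
import numpy as np
from scipy.ndimage import distance_transform_edt

def build_dt(target_points, grid_min, grid_max, resolution=0.1):
    """Precompute distances to the target point cloud on a voxel grid."""
    dims = np.ceil((grid_max - grid_min) / resolution).astype(int) + 1
    occupied = np.ones(dims, dtype=bool)
    idx = np.clip(((target_points - grid_min) / resolution).astype(int), 0, dims - 1)
    occupied[tuple(idx.T)] = False                  # zero-distance cells at targets
    return distance_transform_edt(occupied, sampling=resolution)

def dt_loss(warped_points, dt, grid_min, resolution=0.1):
    """Correspondence-free loss: mean distance-to-target looked up per warped point."""
    idx = np.clip(((warped_points - grid_min) / resolution).astype(int),
                  0, np.array(dt.shape) - 1)
    return dt[tuple(idx.T)].mean()

target = np.random.rand(5000, 3) * 10.0
dt = build_dt(target, grid_min=np.zeros(3), grid_max=np.full(3, 10.0))
loss = dt_loss(np.random.rand(5000, 3) * 10.0, dt, grid_min=np.zeros(3))
```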
+ + ♻ ☆ Ensemble of Anchor-Free Models for Robust Bangla Document Layout + Segmentation + + +
+ In this research paper, we introduce a novel approach designed for the +purpose of segmenting the layout of Bangla documents. Our methodology involves +the utilization of a sophisticated ensemble of YOLOv8 models, which were +trained for the DL Sprint 2.0 - BUET CSE Fest 2023 Competition focused on +Bangla document layout segmentation. Our primary emphasis lies in enhancing +various aspects of the task, including techniques such as image augmentation, +model architecture, and the incorporation of model ensembles. We deliberately +reduce the quality of a subset of document images to enhance the resilience of +model training, thereby resulting in an improvement in our cross-validation +score. By employing Bayesian optimization, we determine the optimal confidence +and Intersection over Union (IoU) thresholds for our model ensemble. Through +our approach, we successfully demonstrate the effectiveness of anchor-free +models in achieving robust layout segmentation in Bangla documents. + +
+ comment: 4 pages, 5 figures, 6 Tables
+ + ♻ ☆ HeadSculpt: Crafting 3D Head Avatars with Text + + +
+ Recently, text-guided 3D generative methods have made remarkable advancements +in producing high-quality textures and geometry, capitalizing on the +proliferation of large vision-language and image diffusion models. However, +existing methods still struggle to create high-fidelity 3D head avatars in two +aspects: (1) They rely mostly on a pre-trained text-to-image diffusion model +whilst missing the necessary 3D awareness and head priors. This makes them +prone to inconsistency and geometric distortions in the generated avatars. (2) +They fall short in fine-grained editing. This is primarily due to the inherited +limitations from the pre-trained 2D image diffusion models, which become more +pronounced when it comes to 3D head avatars. In this work, we address these +challenges by introducing a versatile coarse-to-fine pipeline dubbed HeadSculpt +for crafting (i.e., generating and editing) 3D head avatars from textual +prompts. Specifically, we first equip the diffusion model with 3D awareness by +leveraging landmark-based control and a learned textual embedding representing +the back view appearance of heads, enabling 3D-consistent head avatar +generations. We further propose a novel identity-aware editing score +distillation strategy to optimize a textured mesh with a high-resolution +differentiable rendering technique. This enables identity preservation while +following the editing instruction. We showcase HeadSculpt's superior fidelity +and editing capabilities through comprehensive experiments and comparisons with +existing methods. + +
+ comment: Webpage: https://brandonhan.uk/HeadSculpt/
+ + ♻ ☆ Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation + for Pretrained Deep Generative Model CVPR2023 + + +
+ Semantic editing of images is the fundamental goal of computer vision. +Although deep learning methods, such as generative adversarial networks (GANs), +are capable of producing high-quality images, they often do not have an +inherent way of editing generated images semantically. Recent studies have +investigated a way of manipulating the latent variable to determine the images +to be generated. However, methods that assume linear semantic arithmetic have +certain limitations in terms of the quality of image editing, whereas methods +that discover nonlinear semantic pathways provide non-commutative editing, +which is inconsistent when applied in different orders. This study proposes a +novel method called deep curvilinear editing (DeCurvEd) to determine semantic +commuting vector fields on the latent space. We theoretically demonstrate that +owing to commutativity, the editing of multiple attributes depends only on the +quantities and not on the order. Furthermore, we experimentally demonstrate +that compared to previous methods, the nonlinear and commutative nature of +DeCurvEd facilitates the disentanglement of image attributes and provides +higher-quality editing. + +
+ comment: 15 pages. Code to reproduce our results is linked from the CVF open-access page: https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html
+ + ♻ ☆ Learning Clothing and Pose Invariant 3D Shape Representation for + Long-Term Person Re-Identification ICCV 2023 + + +
+ Long-Term Person Re-Identification (LT-ReID) has become increasingly crucial +in computer vision and biometrics. In this work, we aim to extend LT-ReID +beyond pedestrian recognition to include a wider range of real-world human +activities while still accounting for cloth-changing scenarios over large time +gaps. This setting poses additional challenges due to the geometric +misalignment and appearance ambiguity caused by the diversity of human pose and +clothing. To address these challenges, we propose a new approach 3DInvarReID +for (i) disentangling identity from non-identity components (pose, clothing +shape, and texture) of 3D clothed humans, and (ii) reconstructing accurate 3D +clothed body shapes and learning discriminative features of naked body shapes +for person ReID in a joint manner. To better evaluate our study of LT-ReID, we +collect a real-world dataset called CCDA, which contains a wide variety of +human activities and clothing changes. Experimentally, we show the superior +performance of our approach for person ReID. + +
+ comment: 10 pages, 7 figures, accepted by ICCV 2023
+ + ♻ ☆ Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR + Semantic Segmentation ICCV 2023 + + +
+ The ability to deploy robots that can operate safely in diverse environments +is crucial for developing embodied intelligent agents. As a community, we have +made tremendous progress in within-domain LiDAR semantic segmentation. However, +do these methods generalize across domains? To answer this question, we design +the first experimental setup for studying domain generalization (DG) for LiDAR +semantic segmentation (DG-LSS). Our results confirm a significant gap between +methods, evaluated in a cross-domain setting: for example, a model trained on +the source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data, +compared to $48.49$ mIoU obtained by the model trained on the target domain +(nuScenes). To tackle this gap, we propose the first method specifically +designed for DG-LSS, which obtains $34.88$ mIoU on the target domain, +outperforming all baselines. Our method augments a sparse-convolutional +encoder-decoder 3D segmentation network with an additional, dense 2D +convolutional decoder that learns to classify a birds-eye view of the point +cloud. This simple auxiliary task encourages the 3D network to learn features +that are robust to sensor placement shifts and resolution, and are transferable +across domains. With this work, we aim to inspire the community to develop and +evaluate future models in such cross-domain conditions. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Colab NAS: Obtaining lightweight task-specific convolutional neural + networks following Occam's razor + + +
+ The current trend of applying transfer learning from convolutional neural
+networks (CNNs) trained on large datasets can be an overkill when the target
+application is a custom and delimited problem, with enough data to train a
+network from scratch. On the other hand, the training of custom and lighter
+CNNs requires expertise in the from-scratch case and/or high-end resources in
+the case of hardware-aware neural architecture search (HW NAS), limiting
+access to the technology for non-specialist NN developers.
+  For this reason, we present ColabNAS, an affordable HW NAS technique for
+producing lightweight task-specific CNNs. Its novel derivative-free search
+strategy, inspired by Occam's razor, makes it possible to obtain
+state-of-the-art results on the Visual Wake Word dataset, a standard TinyML
+benchmark, in just 3.1 GPU hours using free online GPU services such as
+Google Colaboratory and Kaggle Kernel.
+
+
+
+
+
+ + ♻ ☆ Sat2Density: Faithful Density Learning from Satellite-Ground Image Pairs ICCV 2023 + + +
+ This paper aims to develop an accurate 3D geometry representation of
+satellite images using satellite-ground image pairs. Our focus is on the
+challenging problem of 3D-aware ground-view synthesis from a satellite image.
+We draw inspiration from the density field representation used in volumetric
+neural rendering and propose a new approach, called Sat2Density. Our method
+utilizes the properties of ground-view panoramas for the sky and non-sky
+regions to learn faithful density fields of 3D scenes from a geometric
+perspective. Unlike other methods that require extra depth information during
+training, our Sat2Density can automatically learn accurate and faithful 3D
+geometry via density representation without depth supervision. This
+advancement significantly improves the ground-view panorama synthesis task.
+Additionally, our study provides a new geometric perspective to understand
+the relationship between satellite and ground-view images in 3D space.
+
+
+ comment: ICCV 2023, project page: https://sat2density.github.io/, code: + https://github.com/qianmingduowan/Sat2Density +
+
+
+
+
+ + ♻ ☆ Compositional Semantic Mix for Domain Adaptation in Point Cloud + Segmentation + + +
+ Deep-learning models for 3D point cloud semantic segmentation exhibit limited +generalization capabilities when trained and tested on data captured with +different sensors or in varying environments due to domain shift. Domain +adaptation methods can be employed to mitigate this domain shift, for instance, +by simulating sensor noise, developing domain-agnostic generators, or training +point cloud completion networks. Often, these methods are tailored for range +view maps or necessitate multi-modal input. In contrast, domain adaptation in +the image domain can be executed through sample mixing, which emphasizes input +data manipulation rather than employing distinct adaptation modules. In this +study, we introduce compositional semantic mixing for point cloud domain +adaptation, representing the first unsupervised domain adaptation technique for +point cloud segmentation based on semantic and geometric sample mixing. We +present a two-branch symmetric network architecture capable of concurrently +processing point clouds from a source domain (e.g. synthetic) and point clouds +from a target domain (e.g. real-world). Each branch operates within one domain +by integrating selected data fragments from the other domain and utilizing +semantic information derived from source labels and target (pseudo) labels. +Additionally, our method can leverage a limited number of human point-level +annotations (semi-supervised) to further enhance performance. We assess our +approach in both synthetic-to-real and real-to-real scenarios using LiDAR +datasets and demonstrate that it significantly outperforms state-of-the-art +methods in both unsupervised and semi-supervised settings. + +
+
+ comment: TPAMI. arXiv admin note: text overlap with arXiv:2207.09778 +
+
+
+
+
+ + ♻ ☆ Cross-Domain Few-Shot Classification via Inter-Source Stylization + + +
+ The goal of Cross-Domain Few-Shot Classification (CDFSC) is to accurately
+classify a target dataset with limited labelled data by exploiting the
+knowledge of a richly labelled auxiliary dataset, despite the differences
+between the domains of the two datasets. Some existing approaches require
+labelled samples from multiple domains for model training. However, these
+methods fail when the sample labels are scarce. To overcome this challenge,
+this paper proposes a solution that makes use of multiple source domains
+without the need for additional labeling costs. Specifically, one of the
+source domains is completely labelled, while the others are unlabelled. An
+Inter-Source Stylization Network (ISSNet) is then introduced to enhance
+stylization across multiple source domains, enriching the data distribution
+and improving the model's generalization capability. Experiments on 8 target
+datasets show that ISSNet leverages unlabelled data from multiple source
+domains and significantly reduces the negative impact of domain gaps on
+classification performance compared to several baseline methods.
+
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ Open Gaze: Open Source eye tracker for smartphone devices using Deep + Learning + + +
+ Eye tracking has been a pivotal tool in diverse fields such as vision
+research, language analysis, and usability assessment. The majority of prior
+investigations, however, have concentrated on expansive desktop displays
+employing specialized, costly eye tracking hardware that lacks scalability.
+Remarkably little insight exists into ocular movement patterns on
+smartphones, despite their widespread adoption and significant usage. In this
+manuscript, we present an open-source implementation of a smartphone-based
+gaze tracker that emulates the methodology proposed by a Google paper (whose
+source code remains proprietary). Our focus is on attaining accuracy
+comparable to that attained through the Google paper's methodology, without
+the necessity for supplementary hardware. Through the integration of machine
+learning techniques, we unveil an accurate eye tracking solution that is
+native to smartphones. Our approach demonstrates precision akin to
+state-of-the-art mobile eye trackers, which are characterized by a cost that
+is two orders of magnitude higher. Leveraging the vast MIT GazeCapture
+dataset, which is available through registration on the dataset's website, we
+successfully replicate crucial findings from previous studies concerning
+ocular motion behavior in oculomotor tasks and saliency analyses during
+natural image observation. Furthermore, we emphasize the applicability of
+smartphone-based gaze tracking in discerning reading comprehension
+challenges. Our findings exhibit the inherent potential to amplify eye
+movement research by significant proportions, accommodating participation
+from thousands of subjects with explicit consent. This scalability not only
+fosters advancements in vision research, but also extends its benefits to
+domains such as accessibility enhancement and healthcare applications.
+
+
+ comment: 26 pages , 15 figures +
+
+
+
+
+ + ♻ ☆ Learning A Coarse-to-Fine Diffusion Transformer for Image Restoration + + +
+ Recent years have witnessed the remarkable performance of diffusion models
+in various vision tasks. However, for image restoration that aims to recover
+clear images with sharper details from given degraded observations,
+diffusion-based methods may fail to recover promising results due to
+inaccurate noise estimation. Moreover, simply constraining the noise cannot
+effectively capture complex degradation information, which limits the model
+capacity. To solve the above problems, we propose a coarse-to-fine diffusion
+Transformer (C2F-DFT) for image restoration. Specifically, our C2F-DFT
+contains diffusion self-attention (DFSA) and a diffusion feed-forward network
+(DFN) within a new coarse-to-fine training scheme. The DFSA and DFN
+respectively capture long-range diffusion dependencies and learn hierarchical
+diffusion representations to facilitate better restoration. In the coarse
+training stage, our C2F-DFT estimates noises and then generates the final
+clean image by a sampling algorithm. To further improve the restoration
+quality, we propose a simple yet effective fine training scheme. It first
+exploits the coarse-trained diffusion model with fixed steps to generate
+restoration results, which are then constrained with the corresponding
+ground-truth images to optimize the model and remedy the unsatisfactory
+results caused by inaccurate noise estimation. Extensive experiments show
+that C2F-DFT significantly outperforms the diffusion-based restoration method
+IR-SDE and achieves competitive performance compared with Transformer-based
+state-of-the-art methods on $3$ tasks, including deraining, deblurring, and
+real denoising. The code is available at https://github.com/wlydlut/C2F-DFT.
+
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Efficient Representation of Natural Image Patches + + +
+ In the complex domain of neural information processing, discerning
+fundamental principles from ancillary details remains a significant
+challenge. While there is extensive knowledge about the anatomy and
+physiology of the early visual system, a comprehensive computational theory
+remains elusive. Can we gain insights into the underlying principles of a
+biological system by abstracting away from its detailed implementation and
+focusing on the fundamental problems that the system is designed to solve?
+Utilizing an abstract model based on minimal yet realistic assumptions, we
+show how to achieve the early visual system's two ultimate objectives:
+efficient information transmission and sensor probability distribution
+modeling. We show that optimizing for information transmission does not yield
+optimal probability distribution modeling. We illustrate, using a two-pixel
+(2D) system and image patches, that an efficient representation can be
+realized via a nonlinear population code driven by two types of biologically
+plausible loss functions that depend solely on output. After unsupervised
+learning, our abstract IPU model bears a remarkable resemblance to biological
+systems, despite not mimicking many features of real neurons, such as spiking
+activity. A preliminary comparison with a contemporary deep learning model
+suggests that the IPU model offers a significant efficiency advantage. Our
+model provides novel insights into the computational theory of early visual
+systems as well as a potential new approach to enhance the efficiency of deep
+learning models.
+
+
+
+
+
+ + ♻ ☆ IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable + Novel View Synthesis ICCV2023 + + +
+ Existing inverse rendering combined with neural rendering methods can only
+perform editable novel view synthesis on object-specific scenes, while we
+present intrinsic neural radiance fields, dubbed IntrinsicNeRF, which
+introduces intrinsic decomposition into the NeRF-based neural rendering
+method and can extend its application to room-scale scenes. Since intrinsic
+decomposition is a fundamentally under-constrained inverse problem, we
+propose a novel distance-aware point sampling and adaptive reflectance
+iterative clustering optimization method, which enables IntrinsicNeRF with
+traditional intrinsic decomposition constraints to be trained in an
+unsupervised manner, resulting in multi-view consistent intrinsic
+decomposition results. To cope with the problem that different adjacent
+instances of similar reflectance in a scene are incorrectly clustered
+together, we further propose a hierarchical clustering method with
+coarse-to-fine optimization to obtain a fast hierarchical indexing
+representation. It supports compelling real-time augmented applications such
+as recoloring and illumination variation. Extensive experiments and editing
+samples on both object-specific/room-scale scenes and synthetic/real-world
+data demonstrate that we can obtain consistent intrinsic decomposition
+results and high-fidelity novel view synthesis even for challenging
+sequences.
+
+
+ comment: Accepted to ICCV2023, Project webpage: + https://zju3dv.github.io/intrinsic_nerf/, code: + https://github.com/zju3dv/IntrinsicNeRF +
+
+
+
+
+ + ♻ ☆ WALDO: Future Video Synthesis using Object Layer Decomposition and + Parametric Flow Prediction ICCV 2023 + + +
+ This paper presents WALDO (WArping Layer-Decomposed Objects), a novel +approach to the prediction of future video frames from past ones. Individual +images are decomposed into multiple layers combining object masks and a small +set of control points. The layer structure is shared across all frames in each +video to build dense inter-frame connections. Complex scene motions are modeled +by combining parametric geometric transformations associated with individual +layers, and video synthesis is broken down into discovering the layers +associated with past frames, predicting the corresponding transformations for +upcoming ones and warping the associated object regions accordingly, and +filling in the remaining image parts. Extensive experiments on multiple +benchmarks including urban videos (Cityscapes and KITTI) and videos featuring +nonrigid motions (UCF-Sports and H3.6M), show that our method consistently +outperforms the state of the art by a significant margin in every case. Code, +pretrained models, and video samples synthesized by our approach can be found +in the project webpage https://16lemoing.github.io/waldo. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Sample4Geo: Hard Negative Sampling For Cross-View Geo-Localisation + + +
+ Cross-View Geo-Localisation is still a challenging task where additional
+modules, specific pre-processing or zooming strategies are necessary to
+determine accurate positions of images. Since different views have different
+geometries, pre-processing like polar transformation helps to merge them.
+However, this results in distorted images which then have to be rectified.
+Adding hard negatives to the training batch could improve the overall
+performance, but with the default loss functions in geo-localisation it is
+difficult to include them. In this article, we present a simplified but
+effective architecture based on contrastive learning with symmetric InfoNCE
+loss that outperforms current state-of-the-art results. Our framework
+consists of a narrow training pipeline that eliminates the need for
+aggregation modules, avoids further pre-processing steps and even increases
+the generalisation capability of the model to unknown regions. We introduce
+two types of sampling strategies for hard negatives. The first explicitly
+exploits geographically neighboring locations to provide a good starting
+point. The second leverages the visual similarity between the image
+embeddings in order to mine hard negative samples. Our work shows excellent
+performance on common cross-view datasets like CVUSA, CVACT, University-1652
+and VIGOR. A comparison between cross-area and same-area settings
+demonstrates the good generalisation capability of our model.
+
+
+
+
+
+ + ♻ ☆ Confidence Attention and Generalization Enhanced Distillation for + Continuous Video Domain Adaptation + + +
+ Continuous Video Domain Adaptation (CVDA) is a scenario where a source model
+is required to adapt to a series of individually available changing target
+domains continuously without source data or target supervision. It has wide
+applications, such as robotic vision and autonomous driving. The main
+underlying challenge of CVDA is to learn helpful information only from the
+unsupervised target data while avoiding forgetting previously learned
+knowledge catastrophically, which is beyond the capability of previous
+Video-based Unsupervised Domain Adaptation methods. Therefore, we propose a
+Confidence-Attentive network with geneRalization enhanced self-knowledge
+disTillation (CART) to address the challenge in CVDA. Firstly, to learn from
+unsupervised domains, we propose to learn from pseudo labels. However, in
+continuous adaptation, prediction errors can accumulate rapidly in pseudo
+labels, and CART effectively tackles this problem with two key modules.
+Specifically, the first module generates refined pseudo labels using model
+predictions and deploys a novel attentive learning strategy. The second
+module compares the outputs of augmented data from the current model to the
+outputs of weakly augmented data from the source model, forming a novel
+consistency regularization on the model to alleviate the accumulation of
+prediction errors. Extensive experiments suggest that the CVDA performance of
+CART outperforms existing methods by a considerable margin.
+
+
+ comment: 16 pages, 9 tables, 10 figures +
+
+
+
+
+ + ♻ ☆ A Conditional Denoising Diffusion Probabilistic Model for Radio + Interferometric Image Reconstruction ECAI 2023 + + +
+ In radio astronomy, signals from radio telescopes are transformed into
+images of observed celestial objects, or sources. However, these images,
+called dirty images, contain real sources as well as artifacts due to signal
+sparsity and other factors. Therefore, radio interferometric image
+reconstruction is performed on dirty images, aiming to produce clean images
+in which artifacts are reduced and real sources are recovered. So far,
+existing methods have had limited success in recovering faint sources,
+preserving detailed structures, and eliminating artifacts. In this paper, we
+present VIC-DDPM, a Visibility and Image Conditioned Denoising Diffusion
+Probabilistic Model. Our main idea is to use both the original visibility
+data in the spectral domain and dirty images in the spatial domain to guide
+the image generation process with DDPM. This way, we can leverage DDPM to
+generate fine details and eliminate noise, while utilizing visibility data to
+separate signals from noise and retain spatial information in dirty images.
+We have conducted experiments in comparison with both traditional methods and
+recent deep learning based approaches. Our results show that our method
+significantly improves the resulting images by reducing artifacts, preserving
+fine details, and recovering dim sources. This advancement further
+facilitates radio astronomical data analysis tasks on celestial phenomena.
+
+
+ comment: Accepted by ECAI 2023 +
+
+
+
+
+ + ♻ ☆ Risk-optimized Outlier Removal for Robust Point Cloud Classification + + +
+ With the growth of 3D sensing technology, deep learning systems for 3D point
+clouds have become increasingly important, especially in applications like
+autonomous vehicles where safety is a primary concern. However, there are
+also growing concerns about the reliability of these systems when they
+encounter noisy point clouds, whether occurring naturally or introduced with
+malicious intent. This paper highlights the challenges of point cloud
+classification posed by various forms of noise, from simple background noise
+to malicious backdoor attacks that can intentionally skew model predictions.
+While there is an urgent need for optimized point cloud denoising, current
+point outlier removal approaches, an essential step in denoising, rely
+heavily on handcrafted strategies and are not adapted for higher-level tasks,
+such as classification. To address this issue, we introduce an innovative
+point outlier cleansing method that harnesses the power of downstream
+classification models. By employing gradient-based attribution analysis, we
+define a novel concept: point risk. Drawing inspiration from tail risk
+minimization in finance, we recast the outlier removal process as an
+optimization problem, named PointCVaR. Extensive experiments show that our
+proposed technique not only robustly filters diverse point cloud outliers but
+also consistently and significantly enhances existing robust methods for
+point cloud classification.
+
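+ The gradient-attribution idea behind point risk can be sketched as follows
+(illustrative assumptions throughout: the norm-of-gradient score, the fixed
+drop fraction, and the toy linear classifier are stand-ins, and the paper's
+CVaR-style optimization is not reproduced here):
+
+import torch
+
+def filter_by_point_risk(points, model, label, drop_frac=0.1):
+    # Score each point by the gradient magnitude of the class logit w.r.t.
+    # its coordinates, then drop the riskiest tail of the distribution.
+    pts = points.clone().requires_grad_(True)
+    logit = model(pts)[label]
+    logit.backward()
+    risk = pts.grad.norm(dim=1)          # per-point attribution score
+    k = int(len(points) * (1 - drop_frac))
+    keep = risk.argsort()[:k]            # keep the low-risk points
+    return points[keep]
+
+# Toy "classifier": a mean-pooled linear head over xyz coordinates.
+torch.manual_seed(0)
+w = torch.randn(3, 4)
+model = lambda p: p.mean(dim=0) @ w
+cleaned = filter_by_point_risk(torch.randn(256, 3), model, label=2)
+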
+
+
+
+
+ + ♻ ☆ DiffusionDepth: Diffusion Denoising Approach for Monocular Depth + Estimation + + +
+ Monocular depth estimation is a challenging task that predicts the pixel-wise +depth from a single 2D image. Current methods typically model this problem as a +regression or classification task. We propose DiffusionDepth, a new approach +that reformulates monocular depth estimation as a denoising diffusion process. +It learns an iterative denoising process to `denoise' random depth distribution +into a depth map with the guidance of monocular visual conditions. The process +is performed in the latent space encoded by a dedicated depth encoder and +decoder. Instead of diffusing ground truth (GT) depth, the model learns to +reverse the process of diffusing the refined depth of itself into random depth +distribution. This self-diffusion formulation overcomes the difficulty of +applying generative models to sparse GT depth scenarios. The proposed approach +benefits this task by refining depth estimation step by step, which is superior +for generating accurate and highly detailed depth maps. Experimental results on +KITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion +approach could reach state-of-the-art performance in both indoor and outdoor +scenarios with acceptable inference time. + +
+
+
+
+
+ + ♻ ☆ Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential + Generative Adversarial Networks + + +
+ In this paper, we propose a bi-modality medical image synthesis approach
+based on a sequential generative adversarial network (GAN) and
+semi-supervised learning. Our approach consists of two generative modules
+that synthesize images of the two modalities in a sequential order. A method
+for measuring the synthesis complexity is proposed to automatically determine
+the synthesis order in our sequential GAN. Images of the modality with a
+lower complexity are synthesized first, and the counterparts with a higher
+complexity are generated later. Our sequential GAN is trained end-to-end in a
+semi-supervised manner. In supervised training, the joint distribution of
+bi-modality images is learned from real paired images of the two modalities
+by explicitly minimizing the reconstruction losses between the real and
+synthetic images. To avoid overfitting limited training images, in
+unsupervised training, the marginal distribution of each modality is learned
+based on unpaired images by minimizing the Wasserstein distance between the
+distributions of real and fake images. We comprehensively evaluate the
+proposed model using two synthesis tasks based on three types of evaluation
+metrics and user studies. Visual and quantitative results demonstrate the
+superiority of our method over state-of-the-art methods, as well as its
+reasonable visual quality and clinical significance. Code is made publicly
+available at https://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis.
+
+
+
+
+
+ + ♻ ☆ Anticipating Driving Behavior through Deep Learning-Based Policy + Prediction + + +
+ In this work, we developed a system that combines visual features derived
+from video frames captured by a regular camera with depth information
+obtained from a point cloud scanner. The system is designed to anticipate
+driving actions, including both vehicle speed and steering angle. To assess
+its reliability, we compared the predicted actions with the established
+behavior of skilled real-world drivers. Our evaluation indicates that the
+predictions are accurate in at least half of the test scenarios (around
+50-80%, depending on the specific model). Notably, using the combined
+features yielded superior performance compared to using video frames in
+isolation in most cases.
+
+
+ comment: 5 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with + Prompt-based Finetuning + + +
+ The Segment Anything Model (SAM) is a recently proposed prompt-based +segmentation model in a generic zero-shot segmentation approach. With the +zero-shot segmentation capacity, SAM achieved impressive flexibility and +precision on various segmentation tasks. However, the current pipeline requires +manual prompts during the inference stage, which is still resource intensive +for biomedical image segmentation. In this paper, instead of using prompts +during the inference stage, we introduce a pipeline that utilizes the SAM, +called all-in-SAM, through the entire AI development workflow (from annotation +generation to model finetuning) without requiring manual prompts during the +inference stage. Specifically, SAM is first employed to generate pixel-level +annotations from weak prompts (e.g., points, bounding box). Then, the +pixel-level annotations are used to finetune the SAM segmentation model rather +than training from scratch. Our experimental results reveal two key findings: +1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a +nuclei segmentation task on the public Monuseg dataset, and 2) the utilization +of weak and few annotations for SAM finetuning achieves competitive performance +compared to using strong pixel-wise annotated data. + +
+
+
+
+
+ + ♻ ☆ High-Resolution Document Shadow Removal via A Large-Scale Real-World + Dataset and A Frequency-Aware Shadow Erasing Net ICCV2023 + + +
+ Shadows often occur when we capture documents with casual equipment, which
+influences the visual quality and readability of the digital copies.
+Different from algorithms for natural shadow removal, algorithms for document
+shadow removal need to preserve the details of fonts and figures in
+high-resolution input. Previous works ignore this problem and remove the
+shadows via approximate attention and small datasets, which might not work in
+real-world situations. We handle high-resolution document shadow removal
+directly via a larger-scale real-world dataset and a carefully designed
+frequency-aware network. As for the dataset, we acquire over 7k pairs of
+high-resolution (2462 x 3699) real-world document images with various samples
+under different lighting conditions, which is 10 times larger than existing
+datasets. As for the design of the network, we decouple the high-resolution
+images in the frequency domain, where the low-frequency details and
+high-frequency boundaries can be effectively learned via the carefully
+designed network structure. Powered by our network and dataset, the proposed
+method clearly shows better performance than previous methods in terms of
+visual quality and numerical results. The code, models, and dataset are
+available at: https://github.com/CXH-Research/DocShadow-SD7K
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ ROSIA: Rotation-Search-Based Star Identification Algorithm + + +
+ This paper presents a rotation-search-based approach for addressing the star +identification (Star-ID) problem. The proposed algorithm, ROSIA, is a +heuristics-free algorithm that seeks the optimal rotation that maximally aligns +the input and catalog stars in their respective coordinates. ROSIA searches the +rotation space systematically with the Branch-and-Bound (BnB) method. Crucially +affecting the runtime feasibility of ROSIA is the upper bound function that +prioritizes the search space. In this paper, we make a theoretical contribution +by proposing a tight (provable) upper bound function that enables a 400x +speed-up compared to an existing formulation. Coupling the bounding function +with an efficient evaluation scheme that leverages stereographic projection and +the R-tree data structure, ROSIA achieves feasible operational speed on +embedded processors with state-of-the-art performances under different sources +of noise. The source code of ROSIA is available at +https://github.com/ckchng/ROSIA. + +
+
+ comment: 21 pages, 16 figures, Accepted to IEEE Transactions on Aerospace and + Electronic Systems +
+
+
+
+
+ + ♻ ☆ Real-time Strawberry Detection Based on Improved YOLOv5s Architecture + for Robotic Harvesting in open-field environment + + +
+ This study proposed a YOLOv5-based custom object detection model to detect
+strawberries in an outdoor environment. The original architecture of YOLOv5s
+was modified by replacing the C3 module with the C2f module in the backbone
+network, which provided a better feature gradient flow. Secondly, the Spatial
+Pyramid Pooling Fast module in the final layer of the backbone network of
+YOLOv5s was combined with Cross Stage Partial Net to improve the
+generalization ability on the strawberry dataset used in this study. The
+proposed architecture was named YOLOv5s-Straw. The RGB image dataset of the
+strawberry canopy with three maturity classes (immature, nearly mature, and
+mature) was collected in an open-field environment and augmented through a
+series of operations including brightness reduction, brightness increase,
+and noise addition. To verify the superiority of the proposed method for
+strawberry detection in an open-field environment, four competitive detection
+models (YOLOv3-tiny, YOLOv5s, YOLOv5s-C2f, and YOLOv8s) were trained and
+tested under the same computational environment and compared with
+YOLOv5s-Straw. The results showed that the proposed architecture achieved the
+highest mean average precision of 80.3%, whereas YOLOv3-tiny, YOLOv5s,
+YOLOv5s-C2f, and YOLOv8s achieved 73.4%, 77.8%, 79.8%, and 79.3%,
+respectively. Specifically, the average precision of YOLOv5s-Straw was 82.1%
+in the immature class, 73.5% in the nearly mature class, and 86.6% in the
+mature class, which were 2.3% and 3.7% higher, respectively, than those of
+the latest YOLOv8s. The model includes 8.6*10^6 network parameters and has an
+inference speed of 18 ms per image, while YOLOv8s has a slower inference
+speed of 21.0 ms and more parameters (11.1*10^6), which indicates that the
+proposed model is fast enough for real-time strawberry detection and
+localization for robotic picking.
+
+
+ comment: 20 pages; 15 figures +
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems: + Challenges and Opportunities + + +
+ Multi-Sensor Fusion (MSF) based perception systems have been the foundation
+in supporting many industrial applications and domains, such as self-driving
+cars, robotic arms, and unmanned aerial vehicles. Over the past few years,
+the fast progress in data-driven artificial intelligence (AI) has brought a
+fast-increasing trend to empower MSF systems with deep learning techniques to
+further improve performance, especially on intelligent systems and their
+perception systems. Although quite a few AI-enabled MSF perception systems
+and techniques have been proposed, up to the present, limited benchmarks that
+focus on MSF perception are publicly available. Given that many intelligent
+systems such as self-driving cars are operated in safety-critical contexts
+where perception systems play an important role, there is an urgent need for
+a more in-depth understanding of the performance and reliability of these MSF
+systems. To bridge this gap, we initiate an early step in this direction and
+construct a public benchmark of AI-enabled MSF-based perception systems
+including three commonly adopted tasks (i.e., object detection, object
+tracking, and depth completion). Based on this, to comprehensively understand
+MSF systems' robustness and reliability, we design 14 common and realistic
+corruption patterns to synthesize large-scale corrupted datasets. We further
+perform a systematic evaluation of these systems through our large-scale
+evaluation. Our results reveal the vulnerability of the current AI-enabled
+MSF perception systems, calling for researchers and practitioners to take
+robustness and reliability into account when designing AI-enabled MSF.
+
+
+ comment: To appear in ESEC/FSE 2023 +
+
+
+
+
+ + ♻ ☆ Streaming Object Detection on Fisheye Cameras for Automatic Parking + + +
+ Fisheye cameras are widely employed in automatic parking, and the video
+stream object detection (VSOD) of the fisheye camera is a fundamental
+perception function to ensure the safe operation of vehicles. Past research
+has generally ignored the mismatch, caused by the latency of the perception
+system, between the output of the deep learning model and the actual
+situation at the current moment. However, the environment inevitably changes
+within this delay, which may create a safety hazard. In this paper, we
+propose a real-time detection framework equipped with a dual-flow perception
+module (dynamic and static flows) that can predict the future and alleviate
+the time-lag problem. Meanwhile, we use a new scheme to evaluate latency and
+accuracy. Standard bounding boxes are unsuitable for objects in fisheye
+camera images due to the strong radial distortion of the fisheye camera, and
+the primary detection targets of parking perception are vehicles and
+pedestrians, so we adopt rotated bounding boxes, a simple and accurate object
+representation, and propose a new periodic angle loss function to regress the
+angle of the box. The instance segmentation ground truth is used to supervise
+the training. Experiments demonstrate the effectiveness of our approach. Code
+is released at: https://gitee.com/hiyanyx/fisheye-streaming-perception.
+
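+ The abstract does not spell out the periodic angle loss, so the following
+is only one plausible construction (an assumption, not the paper's
+definition): a smooth loss that treats angles differing by the box's symmetry
+period as identical:
+
+import torch
+
+def periodic_angle_loss(pred, target, period=torch.pi):
+    # 1 - cos(.) is smooth and non-negative, and vanishes whenever the
+    # predicted angle differs from the target by a multiple of the period,
+    # matching the pi-symmetry of a rotated rectangular box.
+    return (1 - torch.cos(2 * torch.pi * (pred - target) / period)).mean()
+
+loss = periodic_angle_loss(torch.tensor([0.05]),
+                           torch.tensor([0.05 + torch.pi]))
+print(loss)  # ~0: both angles describe the same rotated box
+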
+
+
+
+
+ + ♻ ☆ Human from Blur: Human Pose Tracking from Blurry Images + + +
+ We propose a method to estimate 3D human poses from substantially blurred +images. The key idea is to tackle the inverse problem of image deblurring by +modeling the forward problem with a 3D human model, a texture map, and a +sequence of poses to describe human motion. The blurring process is then +modeled by a temporal image aggregation step. Using a differentiable renderer, +we can solve the inverse problem by backpropagating the pixel-wise reprojection +error to recover the best human motion representation that explains a single or +multiple input images. Since the image reconstruction loss alone is +insufficient, we present additional regularization terms. To the best of our +knowledge, we present the first method to tackle this problem. Our method +consistently outperforms other methods on significantly blurry inputs since +they lack one or multiple key functionalities that our method unifies, i.e. +image deblurring with sub-frame accuracy and explicit 3D modeling of non-rigid +human motion. + +
+
+
+
+
+ + ♻ ☆ Parkinson gait modelling from an anomaly deep representation + + +
+ Parkinson's Disease (PD) is associated with gait movement disorders, such as
+bradykinesia, stiffness, tremors and postural instability, caused by
+progressive dopamine deficiency. Today, some approaches have implemented
+learning representations to quantify kinematic patterns during locomotion,
+supporting clinical procedures such as diagnosis and treatment planning.
+These approaches assume a large amount of stratified and labeled data to
+optimize discriminative representations. Nonetheless, these requirements may
+limit the applicability of such approaches in real clinical practice. This
+work introduces a self-supervised generative representation to learn
+gait-motion-related patterns, under the pretext task of video reconstruction
+and an anomaly detection framework. This architecture is trained following a
+one-class weakly supervised learning scheme to avoid inter-class variance and
+to capture the multiple relationships that characterize locomotion. The
+proposed approach was validated with 14 PD patients and 23 control subjects,
+and trained with the control population only, achieving an AUC of 95%, a
+homoscedasticity level of 70%, and a shapeness level of 70% in the
+classification task, considering its generalization.
+
+
+ comment: Journal not submitted to any editorial +
+
+
+
+
+ + ♻ ☆ GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content SP + + +
+ The mobile cloud gaming industry has been rapidly growing over the last +decade. When streaming gaming videos are transmitted to customers' client +devices from cloud servers, algorithms that can monitor distorted video quality +without having any reference video available are desirable tools. However, +creating No-Reference Video Quality Assessment (NR VQA) models that can +accurately predict the quality of streaming gaming videos rendered by computer +graphics engines is a challenging problem, since gaming content generally +differs statistically from naturalistic videos, often lacks detail, and +contains many smooth regions. Until recently, the problem has been further +complicated by the lack of adequate subjective quality databases of mobile +gaming content. We have created a new gaming-specific NR VQA model called the +Gaming Video Quality Evaluator (GAMIVAL), which combines and leverages the +advantages of spatial and temporal gaming distorted scene statistics models, a +neural noise model, and deep semantic features. Using a support vector +regression (SVR) as a regressor, GAMIVAL achieves superior performance on the +new LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database. + +
+
+ comment: Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been + made available online: https://github.com/lskdream/GAMIVAL +
+
+
+
+
+ + ♻ ☆ Robust affine point matching via quadratic assignment on Grassmannians + + +
+ Robust Affine matching with Grassmannians (RAG) is a new algorithm to perform +affine registration of point clouds. The algorithm is based on minimizing the +Frobenius distance between two elements of the Grassmannian. For this purpose, +an indefinite relaxation of the Quadratic Assignment Problem (QAP) is used, and +several approaches to affine feature matching are studied and compared. +Experiments demonstrate that RAG is more robust to noise and point discrepancy +than previous methods. + +
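+ For reference, the basic distance the method minimizes can be sketched as
+follows (standard Grassmannian machinery only; the indefinite QAP relaxation
+and the feature-matching pipeline that RAG builds on top are not shown):
+
+import numpy as np
+
+def grassmann_frobenius(A, B):
+    # Represent each subspace (the column span of A or B) by its orthogonal
+    # projection matrix, a standard embedding of the Grassmannian, and
+    # compare the two projectors in Frobenius norm.
+    QA, _ = np.linalg.qr(A)
+    QB, _ = np.linalg.qr(B)
+    PA, PB = QA @ QA.T, QB @ QB.T
+    return np.linalg.norm(PA - PB, "fro")
+
+A = np.random.randn(5, 2)
+print(grassmann_frobenius(A, A @ np.random.randn(2, 2)))  # ~0: same span
+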
+
+ comment: 8 pages, 23 figures; GitHub repository at + (https://github.com/sashakolpakov/rag) +
+
+
+
+
+ + ♻ ☆ GazeGNN: A Gaze-Guided Graph Neural Network for Chest X-ray + Classification WACV 2024 + + +
+ Eye tracking research is important in computer vision because it can help us
+understand how humans interact with the visual world. Specifically for
+high-risk applications, such as in medical imaging, eye tracking can help us
+to comprehend how radiologists and other medical professionals search,
+analyze, and interpret images for diagnostic and clinical purposes. Hence,
+the application of eye tracking techniques in disease classification has
+become increasingly popular in recent years. Contemporary works usually
+transform gaze information collected by eye tracking devices into visual
+attention maps (VAMs) to supervise the learning process. However, this is a
+time-consuming preprocessing step, which stops us from applying eye tracking
+to radiologists' daily work. To solve this problem, we propose a novel
+gaze-guided graph neural network (GNN), GazeGNN, to leverage raw eye-gaze
+data without being converted into VAMs. In GazeGNN, to directly integrate eye
+gaze into image classification, we create a unified representation graph that
+models both images and gaze pattern information. With this benefit, we
+develop a real-time, real-world, end-to-end disease classification algorithm
+for the first time in the literature. This achievement demonstrates the
+practicality and feasibility of integrating real-time eye tracking techniques
+into the daily work of radiologists. To the best of our knowledge, GazeGNN is
+the first work that adopts GNN to integrate image and eye-gaze data. Our
+experiments on the public chest X-ray dataset show that our proposed method
+exhibits the best classification performance compared to existing methods.
+The code is available at https://github.com/ukaukaaaa/GazeGNN.
+
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ CASSPR: Cross Attention Single Scan Place Recognition ICCV2023 + + +
+ Place recognition based on point clouds (LiDAR) is an important component
+for autonomous robots or self-driving vehicles. Current SOTA performance is
+achieved on accumulated LiDAR submaps using either point-based or voxel-based
+structures. While voxel-based approaches nicely integrate spatial context
+across multiple scales, they do not exhibit the local precision of
+point-based methods. As a result, existing methods struggle with fine-grained
+matching of subtle geometric features in sparse single-shot LiDAR scans. To
+overcome these limitations, we propose CASSPR as a method to fuse point-based
+and voxel-based approaches using cross attention transformers. CASSPR
+leverages a sparse voxel branch for extracting and aggregating information at
+lower resolution and a point-wise branch for obtaining fine-grained local
+information. CASSPR uses queries from one branch to try to match structures
+in the other branch, ensuring that both extract self-contained descriptors of
+the point cloud (rather than one branch dominating), but using both to inform
+the output global descriptor of the point cloud. Extensive experiments show
+that CASSPR surpasses the state-of-the-art by a large margin on several
+datasets (Oxford RobotCar, TUM, USyd). For instance, it achieves AR@1 of
+85.6% on the TUM dataset, surpassing the strongest prior model by ~15%. Our
+code is publicly available.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ MetaCOG: Learning a Metacognition to Recover What Objects Are Actually + There + + +
+ Humans not only form representations about the world based on what we see, +but also learn meta-cognitive representations about how our own vision works. +This enables us to recognize when our vision is unreliable (e.g., when we +realize that we are experiencing a visual illusion) and enables us to question +what we see. Inspired by this human capacity, we present MetaCOG: a model that +increases the robustness of object detectors by learning representations of +their reliability, and does so without feedback. Specifically, MetaCOG is a +hierarchical probabilistic model that expresses a joint distribution over the +objects in a 3D scene and the outputs produced by a detector. When paired with +an off-the-shelf object detector, MetaCOG takes detections as input and infers +the detector's tendencies to miss objects of certain categories and to +hallucinate objects that are not actually present, all without access to +ground-truth object labels. When paired with three modern neural object +detectors, MetaCOG learns useful and accurate meta-cognitive representations, +resulting in improved performance on the detection task. Additionally, we show +that MetaCOG is robust to varying levels of error in the detections. Our +results are a proof-of-concept for a novel approach to the problem of +correcting a faulty vision system's errors. The model code, datasets, results, +and demos are available: +https://osf.io/8b9qt/?view_only=8c1b1c412c6b4e1697e3c7859be2fce6 + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Robust Long-Tailed Learning via Label-Aware Bounded CVaR + + +
+ Data in real-world classification problems are often imbalanced or
+long-tailed, wherein the majority classes contain most of the samples and
+dominate model training. In such a setting, the naive model tends to have
+poor performance on the minority classes. Previously, a variety of loss
+modifications have been proposed to address the long-tailed learning problem,
+but these methods either treat the samples in the same class indiscriminately
+or lack a theoretical guarantee. In this paper, we propose two novel
+approaches based on CVaR (Conditional Value at Risk) to improve the
+performance of long-tailed learning with solid theoretical grounding.
+Specifically, we first introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss
+to overcome the pessimistic result of the original CVaR, and further design
+the optimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we
+additionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss
+to stabilize the optimization process, for which we also offer theoretical
+support. Extensive experiments on real-world datasets with long-tailed label
+distributions verify the superiority of our proposed methods.
+
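+ The class-conditional CVaR idea can be sketched as follows (an illustration
+only: the alpha levels are arbitrary, and the paper's derived optimal weight
+bounds and the logit-adjusted variant are not reproduced here):
+
+import torch
+
+def label_aware_cvar_loss(logits, labels, alphas):
+    # For each class c, average only the worst alpha_c-fraction of that
+    # class's sample losses, so rare classes can focus on their hardest
+    # examples while frequent classes average over (nearly) all samples.
+    per_sample = torch.nn.functional.cross_entropy(
+        logits, labels, reduction="none")
+    total = 0.0
+    for c, alpha in alphas.items():
+        losses = per_sample[labels == c]
+        if losses.numel() == 0:
+            continue
+        k = max(1, int(alpha * losses.numel()))
+        total = total + losses.topk(k).values.mean()
+    return total / len(alphas)
+
+logits, labels = torch.randn(64, 3), torch.randint(0, 3, (64,))
+print(label_aware_cvar_loss(logits, labels, {0: 1.0, 1: 0.5, 2: 0.2}))
+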
+
+
+
+
+ + ☆ A Multi-Perspective Learning to Rank Approach to Support Children's + Information Seeking in the Classroom + + +
+ We introduce a novel re-ranking model that aims to augment the functionality +of standard search engines to support classroom search activities for children +(ages 6 to 11). This model extends the known listwise learning-to-rank +framework by balancing risk and reward. Doing so enables the model to +prioritize Web resources of high educational alignment, appropriateness, and +adequate readability by analyzing the URLs, snippets, and page titles of Web +resources retrieved by a given mainstream search engine. Experimental results, +including an ablation study and comparisons with existing baselines, showcase +the correctness of the proposed model. The outcomes of this work demonstrate +the value of considering multiple perspectives inherent to the classroom +setting, e.g., educational alignment, readability, and objectionability, when +applied to the design of algorithms that can better support children's +information discovery. + +
+
+ comment: Extended version of the manuscript to appear in proceedings of the + 22nd IEEE/WIC International Conference on Web Intelligence and Intelligent + Agent Technology +
+
+
+
+
+ + ☆ Knowledge-based Multiple Adaptive Spaces Fusion for Recommendation + + +
+ Since Knowledge Graphs (KGs) contain rich semantic information, recently
+there has been an influx of KG-enhanced recommendation methods. Most existing
+methods are designed entirely in Euclidean space without considering
+curvature. However, recent studies have revealed that a tremendous amount of
+graph-structured data exhibits highly non-Euclidean properties. Motivated by
+these observations, in this work, we propose a knowledge-based multiple
+adaptive spaces fusion method for recommendation, namely MCKG. Unlike
+existing methods that solely adopt a specific manifold, we introduce a
+unified space that is compatible with hyperbolic, Euclidean and spherical
+spaces. Furthermore, we fuse the multiple unified spaces in an attention
+manner to obtain high-quality embeddings for better knowledge propagation. In
+addition, we propose a geometry-aware optimization strategy which enables the
+pull and push processes to benefit from both hyperbolic and spherical spaces.
+Specifically, in hyperbolic space, we set smaller margins in the area near
+the origin, which is conducive to distinguishing between highly similar
+positive items and negative ones. At the same time, we set larger margins in
+the area far from the origin to ensure the model has sufficient error
+tolerance. A similar scheme also applies to spherical space. Extensive
+experiments on three real-world datasets demonstrate that MCKG achieves a
+significant improvement over state-of-the-art recommendation methods. Further
+ablation experiments verify the importance of multi-space fusion and the
+geometry-aware optimization strategy, justifying the rationality and
+effectiveness of MCKG.
+
+
+
+
+
+ + ☆ Classification-Aware Neural Topic Model Combined With Interpretable + Analysis -- For Conflict Classification + + +
+ A large number of conflict events occur around the world all the time. In
+order to analyse such conflict events effectively, this paper presents a
+Classification-Aware Neural Topic Model (CANTM-IA) for Conflict Information
+Classification and Topic Discovery. The model provides a reliable
+interpretation of classification results and discovered topics by introducing
+interpretability analysis. At the same time, interpretation is introduced
+into the model architecture to improve the classification performance of the
+model and to allow interpretation to focus further on the details of the
+data. Finally, the model architecture is optimised to reduce the complexity
+of the model.
+
+
+ comment: Accepted by RANLP 2023 +
+
+
+
+
+ + ☆ Providing Previously Unseen Users Fair Recommendations Using Variational + Autoencoders RecSys 2023 + + +
+ An emerging definition of fairness in machine learning requires that models +are oblivious to demographic user information, e.g., a user's gender or age +should not influence the model. Personalized recommender systems are +particularly prone to violating this definition through their explicit user +focus and user modelling. Explicit user modelling is also an aspect that makes +many recommender systems incapable of providing hitherto unseen users with +recommendations. We propose novel approaches for mitigating discrimination in +Variational Autoencoder-based recommender systems by limiting the encoding of +demographic information. The approaches are capable of, and evaluated on, +providing users that are not represented in the training data with fair +recommendations. + +
+
+ comment: Appearing in RecSys 2023 proceedings +
+
+
+
+
+ + ☆ CAGRA: Highly Parallel Graph Construction and Approximate Nearest + Neighbor Search for GPUs + + +
+ Approximate Nearest Neighbor Search (ANNS) plays a critical role in various
+disciplines spanning data mining and artificial intelligence, from
+information retrieval and computer vision to natural language processing and
+recommender systems. Data volumes have soared in recent years and the
+computational cost of an exhaustive exact nearest neighbor search is often
+prohibitive, necessitating the adoption of approximate techniques. The
+balanced performance and recall of graph-based approaches have more recently
+garnered significant attention in ANNS algorithms. However, only a few
+studies have explored harnessing the power of GPUs and multi-core processors
+despite the widespread use of massively parallel and general-purpose
+computing. To bridge this gap, we introduce a novel parallel computing
+hardware-based proximity graph and search algorithm. By leveraging the
+high-performance capabilities of modern hardware, our approach achieves
+remarkable efficiency gains. In particular, our method surpasses existing CPU
+and GPU-based methods in constructing the proximity graph, demonstrating
+higher throughput in both large- and small-batch searches while maintaining
+comparable accuracy. In graph construction time, our method, CAGRA, is
+2.2~27x faster than HNSW, one of the SOTA CPU implementations. In large-batch
+query throughput in the 90% to 95% recall range, our method is 33~77x faster
+than HNSW, and is 3.8~8.8x faster than the SOTA GPU implementations. For a
+single query, our method is 3.4~53x faster than HNSW at 95% recall.
+
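+ The primitive that graph-based ANNS methods like CAGRA and HNSW build on is
+a greedy walk over a proximity graph. A minimal serial sketch (the random
+graph below is a stand-in for a real kNN graph, and none of CAGRA's
+GPU-specific construction or search optimizations are shown):
+
+import numpy as np
+
+def greedy_graph_search(query, vectors, neighbors, start=0, steps=50):
+    # From a start node, repeatedly move to the neighbor closest to the
+    # query; stop at a local minimum of the distance.
+    best = start
+    best_d = np.linalg.norm(vectors[best] - query)
+    for _ in range(steps):
+        improved = False
+        for n in neighbors[best]:
+            d = np.linalg.norm(vectors[n] - query)
+            if d < best_d:
+                best, best_d, improved = n, d, True
+        if not improved:
+            break
+    return best, best_d
+
+rng = np.random.default_rng(0)
+vecs = rng.normal(size=(1000, 16))
+# Toy graph: 8 random edges per node (a real index builds kNN-style edges).
+nbrs = [rng.choice(1000, size=8, replace=False) for _ in range(1000)]
+print(greedy_graph_search(rng.normal(size=16), vecs, nbrs))
+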
+
+
+
+
+ + ☆ Killing two birds with one stone: Can an audio captioning system also be + used for audio-text retrieval? + + +
+ Automated Audio Captioning (AAC) aims to develop systems capable of
+describing an audio recording using a textual sentence. In contrast,
+Audio-Text Retrieval (ATR) systems seek to find the best matching audio
+recording(s) for a given textual query (Text-to-Audio) or vice versa
+(Audio-to-Text). These tasks require different types of systems: AAC employs
+a sequence-to-sequence model, while ATR utilizes a ranking model that
+compares audio and text representations within a shared projection subspace.
+However, this work investigates the relationship between AAC and ATR by
+exploring the ATR capabilities of an unmodified AAC system, without
+fine-tuning for the new task. Our AAC system consists of an audio encoder
+(ConvNeXt-Tiny) trained on AudioSet for audio tagging, and a transformer
+decoder responsible for generating sentences. For AAC, it achieves a high
+SPIDEr-FL score of 0.298 on Clotho and 0.472 on AudioCaps on average. For
+ATR, we propose using the standard Cross-Entropy loss values obtained for any
+audio/caption pair. Experimental results on the Clotho and AudioCaps datasets
+demonstrate decent recall values using this simple approach. For instance, we
+obtained a Text-to-Audio R@1 value of 0.382 for AudioCaps, which is above the
+current state-of-the-art method without external data. Interestingly, we
+observe that normalizing the loss values was necessary for Audio-to-Text
+retrieval.
+
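+ The scoring idea can be sketched as follows (the random logits below stand
+in for a real ConvNeXt-plus-transformer AAC system; the per-text
+normalization mirrors the abstract's observation for Audio-to-Text):
+
+import torch
+
+def caption_nll(logits, token_ids):
+    # Cross-entropy of a caption under the decoder's per-token logits;
+    # lower values indicate a better audio/text match.
+    return torch.nn.functional.cross_entropy(
+        logits, token_ids, reduction="mean")
+
+vocab, n_texts, n_audios = 100, 4, 3
+token_ids = [torch.randint(0, vocab, (12,)) for _ in range(n_texts)]
+# scores[a][t] = NLL of text t given audio a (random stand-in logits here).
+scores = torch.stack([
+    torch.stack([caption_nll(torch.randn(12, vocab), t) for t in token_ids])
+    for _ in range(n_audios)
+])
+t2a = scores.argmin(dim=0)  # Text-to-Audio: best audio per text
+# Audio-to-Text: normalize each text's scores across audios first.
+a2t = (scores / scores.mean(dim=0, keepdim=True)).argmin(dim=1)
+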
+
+ comment: camera-ready version (14/08/23)
+
+
+
+
+ + ☆ STEC: See-Through Transformer-based Encoder for CTR Prediction + + +
+ Click-Through Rate (CTR) prediction holds a pivotal place in online +advertising and recommender systems since CTR prediction performance directly +influences the overall satisfaction of the users and the revenue generated by +companies. Even so, CTR prediction is still an active area of research since it +involves accurately modelling the preferences of users based on sparse and +high-dimensional features where the higher-order interactions of multiple +features can lead to different outcomes. Most CTR prediction models have relied +on a single fusion and interaction learning strategy. The few CTR prediction +models that have utilized multiple interaction modelling strategies have +treated each interaction to be self-contained. In this paper, we propose a +novel model named STEC that reaps the benefits of multiple interaction learning +approaches in a single unified architecture. Additionally, our model introduces +residual connections from different orders of interactions which boosts the +performance by allowing lower level interactions to directly affect the +predictions. Through extensive experiments on four real-world datasets, we +demonstrate that STEC outperforms existing state-of-the-art approaches for CTR +prediction thanks to its greater expressive capabilities. + +
+
+
+
+
+ + ☆ Improving Neural Ranking Models with Traditional IR Methods + + +
+ Neural ranking methods based on large transformer models have recently
+gained significant attention in the information retrieval community, and have
+been adopted by major commercial solutions. Nevertheless, they are
+computationally expensive to create, and require a great deal of labeled data
+for specialized corpora. In this paper, we explore a low-resource alternative
+which is a bag-of-embedding model for document retrieval and find that it is
+competitive with large transformer models fine-tuned on information retrieval
+tasks. Our results show that a simple combination of TF-IDF, a traditional
+keyword matching method, with a shallow embedding model provides a low-cost
+path to compete well with the performance of complex neural ranking models on
+3 datasets. Furthermore, adding TF-IDF measures improves the performance of
+large-scale fine-tuned models on these tasks.
+
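+ A minimal sketch of this kind of hybrid scoring (assumptions: the random
+projection stands in for a trained shallow embedding model, and the equal
+weighting of the two signals is an arbitrary choice):
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+docs = ["neural ranking with transformers", "classic keyword retrieval",
+        "embedding models for document search"]
+query = "document retrieval with embeddings"
+
+# TF-IDF side: exact keyword-matching signal.
+tfidf = TfidfVectorizer().fit(docs)
+kw = (tfidf.transform([query]) @ tfidf.transform(docs).T).toarray()[0]
+
+# Embedding side: random projection as a stand-in for a trained model.
+rng = np.random.default_rng(0)
+proj = rng.normal(size=(len(tfidf.vocabulary_), 16))
+def embed(texts):
+    v = tfidf.transform(texts).toarray() @ proj
+    return v / np.linalg.norm(v, axis=1, keepdims=True)
+emb = embed([query]) @ embed(docs).T
+
+# Simple late fusion of the two signals, then rank.
+scores = 0.5 * kw + 0.5 * emb[0]
+print(np.argsort(-scores))
+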
+
+ comment: Short paper, 4 pages +
+
+
+
+
+ + ☆ CAPS: A Practical Partition Index for Filtered Similarity Search + + +
+ With the surging popularity of approximate near-neighbor search (ANNS), +driven by advances in neural representation learning, the ability to serve +queries accompanied by a set of constraints has become an area of intense +interest. While the community has recently proposed several algorithms for +constrained ANNS, almost all of these methods focus on integration with +graph-based indexes, the predominant class of algorithms achieving +state-of-the-art performance in latency-recall tradeoffs. In this work, we take +a different approach and focus on developing a constrained ANNS algorithm via +space partitioning as opposed to graphs. To that end, we introduce Constrained +Approximate Partitioned Search (CAPS), an index for ANNS with filters via space +partitions that not only retains the benefits of a partition-based algorithm +but also outperforms state-of-the-art graph-based constrained search techniques +in recall-latency tradeoffs, with only 10% of the index size. + +
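+ A toy illustration of filtered search via space partitioning (in the spirit
+of CAPS only; the actual index's partition construction and query algorithm
+differ):
+
+import numpy as np
+from collections import defaultdict
+
+class PartitionedFilterIndex:
+    """Buckets vectors by attribute; a filtered query scans only the
+    matching partition instead of the whole collection or a graph."""
+    def __init__(self, vectors, attrs):
+        self.vectors = vectors
+        self.parts = defaultdict(list)
+        for i, a in enumerate(attrs):
+            self.parts[a].append(i)
+
+    def search(self, query, attr, k=5):
+        ids = np.array(self.parts[attr])
+        d = np.linalg.norm(self.vectors[ids] - query, axis=1)
+        return ids[np.argsort(d)[:k]]
+
+rng = np.random.default_rng(0)
+idx = PartitionedFilterIndex(rng.normal(size=(1000, 8)),
+                             rng.integers(0, 4, size=1000))
+print(idx.search(rng.normal(size=8), attr=2))
+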
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Continual Learning for Generative Retrieval over Dynamic Corpora CIKM 2023 + + +
+ Generative retrieval (GR) directly predicts the identifiers of relevant +documents (i.e., docids) based on a parametric model. It has achieved solid +performance on many ad-hoc retrieval tasks. So far, these tasks have assumed a +static document collection. In many practical scenarios, however, document +collections are dynamic, where new documents are continuously added to the +corpus. The ability to incrementally index new documents while preserving the +ability to answer queries with both previously and newly indexed relevant +documents is vital to applying GR models. In this paper, we address this +practical continual learning problem for GR. We put forward a novel +Continual-LEarner for generatiVE Retrieval (CLEVER) model and make two major +contributions to continual learning for GR: (i) To encode new documents into +docids with low computational cost, we present Incremental Product +Quantization, which updates a partial quantization codebook according to two +adaptive thresholds; and (ii) To memorize new documents for querying without +forgetting previous knowledge, we propose a memory-augmented learning +mechanism, to form meaningful connections between old and new documents. +Empirical results demonstrate the effectiveness and efficiency of the proposed +model. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Vector Search with OpenAI Embeddings: Lucene Is All You Need + + +
+ We provide a reproducible, end-to-end demonstration of vector search with +OpenAI embeddings using Lucene on the popular MS MARCO passage ranking test +collection. The main goal of our work is to challenge the prevailing narrative +that a dedicated vector store is necessary to take advantage of recent advances +in deep neural networks as applied to search. Quite the contrary, we show that +hierarchical navigable small-world network (HNSW) indexes in Lucene are +adequate to provide vector search capabilities in a standard bi-encoder +architecture. This suggests that, from a simple cost-benefit analysis, there +does not appear to be a compelling reason to introduce a dedicated vector store +into a modern "AI stack" for search, since such applications have already +received substantial investments in existing, widely deployed infrastructure. + +
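The bi-encoder-plus-HNSW recipe is compact; the sketch below uses the hnswlib package purely as a stand-in for Lucene's HNSW implementation, with random vectors in place of OpenAI embeddings:

```python
# Illustrative sketch: index bi-encoder embeddings in an HNSW graph and query
# for nearest neighbors (hnswlib here is an assumption, not the paper's stack).
import numpy as np
import hnswlib

dim, n = 768, 10_000
vectors = np.random.default_rng(0).normal(size=(n, dim)).astype(np.float32)

index = hnswlib.Index(space="cosine", dim=dim)
index.init_index(max_elements=n, ef_construction=200, M=16)
index.add_items(vectors, np.arange(n))
index.set_ef(64)                          # query-time accuracy/speed knob

labels, distances = index.knn_query(vectors[:1], k=10)
print(labels)
```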
+
+
+
+
+ + ☆ Ensuring User-side Fairness in Dynamic Recommender Systems + + +
+ User-side group fairness is crucial for modern recommender systems, as it +aims to alleviate performance disparity between groups of users defined by +sensitive attributes such as gender, race, or age. We find that the disparity +tends to persist or even increase over time. This calls for effective ways to +address user-side fairness in a dynamic environment, which has been +infrequently explored in the literature. However, fairness-constrained +re-ranking, a typical method to ensure user-side fairness (i.e., reducing +performance disparity), faces two fundamental challenges in the dynamic +setting: (1) non-differentiability of the ranking-based fairness constraint, +which hinders the end-to-end training paradigm, and (2) time-inefficiency, +which impedes quick adaptation to changes in user preferences. In this paper, +we propose FAir Dynamic rEcommender (FADE), an end-to-end framework with a +fine-tuning strategy to dynamically alleviate performance disparity. To tackle +the above challenges, FADE uses a novel fairness loss designed to be +differentiable and lightweight to fine-tune model parameters to ensure both +user-side fairness and high-quality recommendations. Via extensive experiments +on a real-world dataset, we empirically demonstrate that FADE effectively and +efficiently reduces performance disparity, and furthermore, FADE improves +overall recommendation quality over time compared to not using any new data. + +
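To illustrate what a differentiable, lightweight user-side fairness term can look like, here is a hedged sketch that penalizes the squared gap between two groups' mean utility (FADE's actual loss is more elaborate; this is only the general shape of the idea):

```python
# Hedged sketch of a differentiable group-fairness regularizer, added to the
# usual recommendation loss during fine-tuning.
import torch

def fairness_loss(scores: torch.Tensor, group: torch.Tensor) -> torch.Tensor:
    # scores: per-user recommendation quality proxy; group: 0/1 membership
    g0 = scores[group == 0].mean()
    g1 = scores[group == 1].mean()
    return (g0 - g1) ** 2           # smooth, cheap to fine-tune with

# total = rec_loss + lam * fairness_loss(scores, group)   # lam is assumed
```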
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster + Analysis + + +
+ We introduce the use of a reduction property of the penalty-based formulation of +pseudo-Boolean polynomials as a mechanism for invariant dimensionality +reduction in cluster analysis processes. In our experiments, we show that +multidimensional data, like the 4-dimensional Iris Flower dataset, can be reduced to +2-dimensional space, while the 30-dimensional Wisconsin Diagnostic Breast Cancer +(WDBC) dataset can be reduced to 3-dimensional space, and that by searching for lines or +planes that lie between the reduced samples we can extract clusters in a linear and +unbiased manner with competitive accuracy, reproducibility, and clear +interpretation. + +
+
+ comment: 14 pages, 4 figures, submitted to the International Conference Data + Analysis, Optimization and Their Applications on the Occasion of Boris + Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region, + Moscow Institute of Physics and Technology + https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php +
+
+
+
+
+ + ☆ Chunked Lists versus Extensible Arrays for Text Inversion + + +
+ In our 2017 work on in-memory list-based text inversion [Hawking and +Billerbeck. Efficient In-Memory, List-Based Text Inversion. ADCS 2017] we +compared memory use and indexing speed of a considerable number of variants of +chunked linked lists. In the present work we compare the best performing of +those variants (FBB - dynamic Fibonacci chunking) with the extensible SQ array +technique (SQA) presented in [Moffat and Mackenzie. Immediate-Access Indexing +Using Space-Efficient Extensible Arrays. ADCS 2023]. + +
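For readers unfamiliar with chunked postings lists, a toy Fibonacci-chunked list might look like this (an illustrative sketch of the chunking idea, not the FBB implementation being benchmarked):

```python
# Toy dynamic Fibonacci chunking: successive chunk capacities follow the
# Fibonacci sequence, balancing wasted space against pointer overhead.
class FibChunkedPostings:
    def __init__(self):
        self.chunks = []                  # list of fixed-capacity lists
        self.cap_a, self.cap_b = 1, 2     # next two chunk capacities
        self._cap = 0                     # capacity of the current chunk

    def append(self, doc_id: int) -> None:
        if not self.chunks or len(self.chunks[-1]) == self._cap:
            self._cap = self.cap_a
            self.cap_a, self.cap_b = self.cap_b, self.cap_a + self.cap_b
            self.chunks.append([])
        self.chunks[-1].append(doc_id)

    def __iter__(self):
        for chunk in self.chunks:
            yield from chunk

p = FibChunkedPostings()
for d in range(10):
    p.append(d)
print([len(c) for c in p.chunks])   # chunk capacities grow 1, 2, 3, 5, ...
```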
+
+ comment: 2 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a meticulously curated, comprehensive +instruction dataset expressly designed for the biomolecular realm. +Mol-Instructions is composed of three pivotal components: molecule-oriented +instructions, protein-oriented instructions, and biomolecular text +instructions, each curated to enhance the understanding and prediction +capabilities of LLMs concerning biomolecular features and behaviors. Through +extensive instruction tuning experiments on the representative LLM, we +underscore the potency of Mol-Instructions to enhance the adaptability and +cognitive acuity of large models within the complex sphere of biomolecular +studies, thereby promoting advancements in the biomolecular research community. +Mol-Instructions is made publicly accessible for future research endeavors and +will be subjected to continual updates for enhanced applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions. Add + quantitative evaluations +
+
+
+
+
+ + ♻ ☆ Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model + + +
+ Sentiment analysis is the process of identifying and categorizing people's +emotions or opinions regarding various topics. The analysis of Twitter +sentiment has become an increasingly popular topic in recent years. In this +paper, we present several machine learning models and a deep learning model to +analyze the sentiment of Persian political tweets. Our analysis was conducted +using Bag of Words and ParsBERT for word representation. We applied Gaussian +Naive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random +Forests, as well as a combination of CNN and LSTM to classify the polarities of +tweets. The results of this study indicate that deep learning with ParsBERT +embedding performs better than machine learning. The CNN-LSTM model had the +highest classification accuracy, with 89 percent on the first dataset and 71 +percent on the second dataset. Due to the complexity of Persian, it was a +difficult task to achieve this level of efficiency. The main objective of our +research was to reduce the training time while maintaining the model's +performance. As a result, several adjustments were made to the model +architecture and parameters. In addition to achieving the objective, the +performance was slightly improved as well. + +
+
+
+
+
+ + ♻ ☆ Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language + Models + + +
+ Dense retrieval (DR) converts queries and documents into dense embeddings and +measures the similarity between queries and documents in vector space. One of +the challenges in DR is the lack of domain-specific training data. While DR +models can learn from large-scale public datasets like MS MARCO through +transfer learning, evidence shows that not all DR models and domains can +benefit from transfer learning equally. Recently, some researchers have +resorted to large language models (LLMs) to improve the zero-shot and few-shot +DR models. However, the hard prompts or human-written prompts utilized in these +works cannot guarantee the quality of the generated weak queries. To tackle +this, we propose soft prompt tuning for augmenting DR (SPTAR): For each task, +we leverage soft prompt-tuning to optimize a task-specific soft prompt on +limited ground truth data and then prompt the LLMs to tag unlabeled documents +with weak queries, yielding enough weak document-query pairs to train +task-specific dense retrievers. We design a filter to select high-quality +example document-query pairs in the prompt to further improve the quality of +the weakly tagged queries. To the best of our knowledge, there is no prior work +utilizing soft prompt tuning to augment DR models. The experiments demonstrate +that SPTAR outperforms the unsupervised baseline BM25 and the recently +proposed LLM-based augmentation method for DR. + +
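The core soft-prompt mechanics are compact; below is a generic PyTorch sketch (not the SPTAR code) in which learnable prompt vectors are prepended to a frozen model's input embeddings, so only the prompt parameters receive gradients:

```python
# Generic soft-prompt sketch: a small learnable matrix is concatenated in
# front of the frozen LM's token embeddings.
import torch
import torch.nn as nn

class SoftPrompt(nn.Module):
    def __init__(self, n_tokens: int, d_model: int):
        super().__init__()
        self.prompt = nn.Parameter(torch.randn(n_tokens, d_model) * 0.02)

    def forward(self, input_embeds: torch.Tensor) -> torch.Tensor:
        batch = input_embeds.size(0)
        p = self.prompt.unsqueeze(0).expand(batch, -1, -1)
        return torch.cat([p, input_embeds], dim=1)

soft = SoftPrompt(n_tokens=8, d_model=768)
x = torch.randn(4, 32, 768)             # frozen LM token embeddings
print(soft(x).shape)                    # torch.Size([4, 40, 768])
```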
+
+ comment: fix typos +
+
+
+
+
+ + ♻ ☆ Dual-Granularity Contrastive Learning for Session-based Recommendation + + +
+ Session-based recommendation systems (SBRS) are better suited to current +e-commerce and streaming media recommendation scenarios and have thus become a +hot topic. The data encountered by SBRS is typically highly sparse, which also +serves as one of the bottlenecks limiting the accuracy of recommendations. +Contrastive Learning (CL) has therefore been applied to SBRS owing to its capability of +improving embedding learning under the condition of sparse data. However, +existing CL strategies are limited in their ability to enforce finer-grained +(e.g., factor-level) comparisons and, as a result, are unable to capture subtle +differences between instances. Moreover, these strategies usually use +item or segment dropout as a means of data augmentation, which may result in +sparser data and thus ineffective self-supervised signals. To address the +two aforementioned limitations, we introduce a novel multi-granularity CL +framework. Specifically, two extra augmented embedding convolution channels +with different granularities are constructed, and the embeddings they learn +are compared with those learned from the original view to complete the CL tasks. At +the factor level, we employ Disentangled Representation Learning to obtain +finer-grained data (e.g., factor-level embeddings), from which we construct +factor-level convolution channels. At the item level, the star graph is deployed as +the augmented data, and graph convolution on it ensures the effectiveness of +self-supervised signals. The embeddings learned from these two views are compared with +those of the basic view to achieve CL at both granularities. +Finally, the more precise item-level and factor-level embeddings obtained are +used to generate personalized recommendations for the user. The proposed +model is validated through extensive experiments on two benchmark datasets, +showcasing superior performance compared to existing methods. + +
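A contrastive objective between two augmented views of the same session typically reduces to InfoNCE; the generic sketch below shows the building block on which multi-granularity channels like these are usually stacked (an illustration, not the paper's exact loss):

```python
# Standard InfoNCE between two views; matching rows are positives.
import torch
import torch.nn.functional as F

def info_nce(z1: torch.Tensor, z2: torch.Tensor, tau: float = 0.2):
    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
    logits = z1 @ z2.t() / tau              # (batch, batch) similarities
    labels = torch.arange(z1.size(0))       # positives on the diagonal
    return F.cross_entropy(logits, labels)

loss = info_nce(torch.randn(64, 128), torch.randn(64, 128))
```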
+
+
+
+
+ + ♻ ☆ RecXplainer: Amortized Attribute-based Personalized Explanations for + Recommender Systems NeurIPS 2022 + + +
+ Recommender systems influence many of our interactions in the digital world +-- impacting how we shop for clothes, sorting what we see when browsing YouTube +or TikTok, and determining which restaurants and hotels we are shown when using +hospitality platforms. Modern recommender systems are large, opaque models +trained on a mixture of proprietary and open-source datasets. Naturally, issues +of trust arise on both the developer and user side: is the system working +correctly, and why did a user receive (or not receive) a particular +recommendation? Providing an explanation alongside a recommendation alleviates +some of these concerns. The status quo for auxiliary recommender system +feedback is either user-specific explanations (e.g., "users who bought item B +also bought item A") or item-specific explanations (e.g., "we are recommending +item A because you watched/bought item B"). However, users bring personalized +context into their search experience, valuing an item as a function of that +item's attributes and their own personal preferences. In this work, we propose +RecXplainer, a novel method for generating fine-grained explanations based on a +user's preferences over the attributes of recommended items. We evaluate +RecXplainer on five real-world and large-scale recommendation datasets using +five different kinds of recommender systems to demonstrate the efficacy of +RecXplainer in capturing users' preferences over item attributes and using them +to explain recommendations. We also compare RecXplainer to five baselines and +show RecXplainer's exceptional performance on ten metrics. + +
+
+ comment: Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022 +
+
+
+
+
+
+
+
+ + Machine Learning 157 + +
+
+
+ + ☆ 3D Adversarial Augmentations for Robust Out-of-Domain Predictions + + +
+ Since real-world training datasets cannot properly sample the long tail of +the underlying data distribution, corner cases and rare out-of-domain samples +can severely hinder the performance of state-of-the-art models. This problem +becomes even more severe for dense tasks, such as 3D semantic segmentation, +where points of non-standard objects can be confidently associated with the wrong +class. In this work, we focus on improving the generalization to out-of-domain +data. We achieve this by augmenting the training set with adversarial examples. +First, we learn a set of vectors that deform the objects in an adversarial +fashion. To prevent the adversarial examples from being too far from the +existing data distribution, we preserve their plausibility through a series of +constraints, ensuring sensor awareness and shape smoothness. Then, we perform +adversarial augmentation by applying the learned sample-independent vectors to +the available objects when training a model. We conduct extensive experiments +across a variety of scenarios on data from KITTI, Waymo, and CrashD for 3D +object detection, and on data from SemanticKITTI, Waymo, and nuScenes for 3D +semantic segmentation. Despite training on a standard single dataset, our +approach substantially improves the robustness and generalization of both 3D +object detection and 3D semantic segmentation methods to out-of-domain data. + +
+
+ comment: 37 pages, 12 figures +
+
+
+
+
+ + ☆ An Adaptive Tangent Feature Perspective of Neural Networks + + +
+ In order to better understand feature learning in neural networks, we propose +a framework for understanding linear models in tangent feature space where the +features are allowed to be transformed during training. We consider linear +transformations of features, resulting in a joint optimization over parameters +and transformations with a bilinear interpolation constraint. We show that this +optimization problem has an equivalent linearly constrained optimization with +structured regularization that encourages approximately low rank solutions. +Specializing to neural network structure, we gain insights into how the +features and thus the kernel function change, providing additional nuance to +the phenomenon of kernel alignment when the target function is poorly +represented using tangent features. In addition to verifying our theoretical +observations in real neural networks on a simple regression problem, we +empirically show that an adaptive feature implementation of tangent feature +classification has an order of magnitude lower sample complexity than the fixed +tangent feature model on MNIST and CIFAR-10. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Policy composition in reinforcement learning via multi-objective policy + optimization + + +
+ We enable reinforcement learning agents to learn successful behavior policies +by utilizing relevant pre-existing teacher policies. The teacher policies are +introduced as objectives, in addition to the task objective, in a +multi-objective policy optimization setting. Using the Multi-Objective Maximum +a Posteriori Policy Optimization algorithm +\citep{abdolmaleki2020distributional}, we show that teacher policies can help +speed up learning, particularly in the absence of shaping rewards. In two +domains with continuous observation and action spaces, our agents successfully +compose teacher policies in sequence and in parallel, and are also able to +further extend the policies of the teachers in order to solve the task. + Depending on the specified combination of task and teacher(s), teacher(s) may +naturally act to limit the final performance of an agent. The extent to which +agents are required to adhere to teacher policies is determined by +hyperparameters, which determine both the effect of teachers on learning speed +and the eventual performance of the agent on the task. In the {\tt humanoid} +domain \citep{deepmindcontrolsuite2018}, we also equip agents with the ability +to control the selection of teachers. With this ability, agents are able to +meaningfully compose from the teacher policies to achieve a higher task +reward on the {\tt walk} task than in cases without access to the teacher +policies. We show the resemblance of composed task policies with the +corresponding teacher policies through videos. + +
+
+
+
+
+ + ☆ Input margins can predict generalization too + + +
+ Understanding generalization in deep neural networks is an active area of +research. A promising avenue of exploration has been that of margin +measurements: the shortest distance to the decision boundary for a given sample +or its representation internal to the network. While margins have been shown to +be correlated with the generalization ability of a model when measured at its +hidden representations (hidden margins), no such link between large margins and +generalization has been established for input margins. We show that while input +margins are not generally predictive of generalization, they can be if the +search space is appropriately constrained. We develop such a measure based on +input margins, which we refer to as 'constrained margins'. The predictive power +of this new measure is demonstrated on the 'Predicting Generalization in Deep +Learning' (PGDL) dataset and contrasted with hidden representation margins. We +find that constrained margins achieve highly competitive scores and outperform +other margin measurements in general. This provides novel insight into the +relationship between generalization and classification margins, and highlights +the importance of considering the data manifold for investigations of +generalization in DNNs. + +
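Input margins are commonly approximated to first order as the logit gap divided by its gradient norm; a minimal sketch of that standard approximation (not the paper's constrained search procedure):

```python
# First-order input-margin estimate: |f_c(x) - f_k(x)| / ||grad of the gap||.
import torch

def input_margin(model, x, true_class: int, other_class: int) -> float:
    # x: a single example with batch dimension, e.g. shape (1, C, H, W)
    x = x.clone().requires_grad_(True)
    logits = model(x)
    diff = logits[0, true_class] - logits[0, other_class]
    grad, = torch.autograd.grad(diff, x)
    return (diff.abs() / grad.norm()).item()
```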
+
+
+
+
+ + ☆ A Comparative Study of Loss Functions: Traffic Predictions in Regular + and Congestion Scenarios + + +
+ Spatiotemporal graph neural networks have achieved state-of-the-art +performance in traffic forecasting. However, they often struggle to forecast +congestion accurately due to the limitations of traditional loss functions. +While accurate forecasting of regular traffic conditions is crucial, a reliable +AI system must also accurately forecast congestion scenarios to maintain safe +and efficient transportation. In this paper, we explore various loss functions +inspired by heavy tail analysis and imbalanced classification problems to +address this issue. We evaluate the efficacy of these loss functions in +forecasting traffic speed, with an emphasis on congestion scenarios. Through +extensive experiments on real-world traffic datasets, we discovered that when +optimizing for Mean Absolute Error (MAE), the MAE-Focal Loss function stands +out as the most effective. When optimizing Mean Squared Error (MSE), Gumbel +Loss proves to be the superior choice. These choices effectively forecast +traffic congestion events without compromising the accuracy of regular traffic +speed forecasts. This research enhances deep learning models' capabilities in +forecasting sudden speed changes due to congestion and underscores the need for +more research in this direction. By elevating the accuracy of congestion +forecasting, we advocate for AI systems that are reliable, secure, and +resilient in practical traffic management scenarios. + +
+
+
+
+
+ + ☆ Canonical Factors for Hybrid Neural Fields ICCV 2023 + + +
+ Factored feature volumes offer a simple way to build more compact, efficient, +and interpretable neural fields, but also introduce biases that are not +necessarily beneficial for real-world data. In this work, we (1) characterize +the undesirable biases that these architectures have for axis-aligned signals +-- they can lead to radiance field reconstruction differences as high as 2 +PSNR -- and (2) explore how learning a set of canonicalizing transformations +can improve representations by removing these biases. We prove in a +two-dimensional model problem that simultaneously learning these +transformations together with scene appearance succeeds with drastically +improved efficiency. We validate the resulting architectures, which we call +TILTED, using image, signed distance, and radiance field reconstruction tasks, +where we observe improvements across quality, robustness, compactness, and +runtime. Results demonstrate that TILTED can enable capabilities comparable to +baselines that are 2x larger, while highlighting weaknesses of neural field +evaluation procedures. + +
+
+ comment: ICCV 2023. Project webpage: https://brentyi.github.io/tilted/ +
+
+
+
+
+ + ☆ From SMOTE to Mixup for Deep Imbalanced Classification + + +
+ Given imbalanced data, it is hard to train a good classifier using deep +learning because of the poor generalization of minority classes. Traditionally, +the well-known synthetic minority oversampling technique (SMOTE) for data +augmentation, a data mining approach for imbalanced learning, has been used to +improve this generalization. However, it is unclear whether SMOTE also benefits +deep learning. In this work, we study why the original SMOTE is insufficient +for deep learning, and enhance SMOTE using soft labels. Connecting the +resulting soft SMOTE with Mixup, a modern data augmentation technique, leads to +a unified framework that puts traditional and modern data augmentation +techniques under the same umbrella. A careful study within this framework shows +that Mixup improves generalization by implicitly achieving uneven margins +between majority and minority classes. We then propose a novel margin-aware +Mixup technique that more explicitly achieves uneven margins. Extensive +experimental results demonstrate that our proposed technique yields +state-of-the-art performance on deep imbalanced classification while achieving +superior performance on extremely imbalanced data. The code is open-sourced in +our developed package https://github.com/ntucllab/imbalanced-DL to foster +future research in this direction. + +
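Plain Mixup with soft labels, the modern counterpart of soft SMOTE in this unified view, fits in a few lines (the margin-aware variant described above additionally shapes the mixing toward uneven margins; `alpha` here is an assumed hyperparameter):

```python
# Standard Mixup: convex combinations of inputs and of one-hot labels,
# yielding soft labels for the mixed samples.
import numpy as np

def mixup(x, y_onehot, alpha: float = 0.2, rng=np.random.default_rng(0)):
    lam = rng.beta(alpha, alpha)
    idx = rng.permutation(len(x))
    x_mix = lam * x + (1 - lam) * x[idx]
    y_mix = lam * y_onehot + (1 - lam) * y_onehot[idx]   # soft labels
    return x_mix, y_mix

# x_mix, y_mix = mixup(batch_images, batch_onehot_labels)
```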
+
+ comment: 25 pages, 3 figures +
+
+
+
+
+ + ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ The reasoning capabilities of Large Language Models (LLMs) play a pivotal +role in the realm of embodied artificial intelligence. Although there are +effective methods like program-of-thought prompting for LLMs, which uses a +programming language to tackle complex reasoning tasks, the specific impact of +code data on the improvement of reasoning capabilities remains under-explored. +To address this gap, we propose the complexity-impacted reasoning score (CIRS), +which combines structural and logical attributes, to measure the correlation +between code and reasoning abilities. Specifically, we use the abstract syntax +tree to encode the structural information and calculate logical complexity by +considering the difficulty and the cyclomatic complexity. Through an empirical +analysis, we find that not all code data, at every complexity level, can be learned or understood +by LLMs; an optimal level of complexity is critical to the improvement of +reasoning abilities through program-aided prompting. Then we design an +auto-synthesizing and stratifying algorithm, and apply it to instruction +generation for mathematical reasoning and code data filtering for code +generation tasks. Extensive results demonstrate the effectiveness of our +proposed approach. Code will be integrated into the EasyInstruct framework at +https://github.com/zjunlp/EasyInstruct. + +
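A rough proxy for an AST-based complexity score can be computed with Python's ast module; the node and branch weighting below is an assumption for illustration, not the paper's CIRS formula:

```python
# Assumed proxy: AST node count plus a cyclomatic-style count of branch nodes.
import ast

BRANCH = (ast.If, ast.For, ast.While, ast.Try, ast.BoolOp)

def complexity_score(code: str) -> float:
    tree = ast.parse(code)
    nodes = list(ast.walk(tree))
    branches = sum(isinstance(n, BRANCH) for n in nodes)
    return len(nodes) + 2.0 * branches     # weighting is an assumption

print(complexity_score("for i in range(3):\n    if i % 2:\n        print(i)"))
```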
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Random feature approximation for general spectral methods + + +
+ Random feature approximation is arguably one of the most popular techniques +to speed up kernel methods in large scale algorithms and provides a theoretical +approach to the analysis of deep neural networks. We analyze generalization +properties for a large class of spectral regularization methods combined with +random features, containing kernel methods with implicit regularization such as +gradient descent or explicit methods like Tikhonov regularization. For our +estimators we obtain optimal learning rates over regularity classes (even for +classes that are not included in the reproducing kernel Hilbert space), which +are defined through appropriate source conditions. This improves or completes +previous results obtained in related settings for specific kernel algorithms. + +
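The classic random Fourier features of Rahimi and Recht are the archetype of the approximation analyzed here; a minimal sketch for the Gaussian kernel:

```python
# Random Fourier features: z(x)^T z(y) approximates exp(-gamma * ||x - y||^2).
import numpy as np

def rff(X: np.ndarray, D: int, gamma: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    # spectral density of the Gaussian kernel is N(0, 2*gamma*I)
    W = rng.normal(scale=np.sqrt(2 * gamma), size=(X.shape[1], D))
    b = rng.uniform(0, 2 * np.pi, size=D)
    return np.sqrt(2.0 / D) * np.cos(X @ W + b)

X = np.random.default_rng(1).normal(size=(500, 10))
Z = rff(X, D=1000, gamma=0.5)
approx_kernel = Z @ Z.T        # approximate Gram matrix, no O(n^2 d) kernel
```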
+
+
+
+
+ + ☆ Probabilistic solar flare forecasting using historical magnetogram data + + +
+ Solar flare forecasting research using machine learning (ML) has focused on +high resolution magnetogram data from the SDO/HMI era covering Solar Cycle 24 +and the start of Solar Cycle 25, with some efforts looking back to SOHO/MDI for +data from Solar Cycle 23. In this paper, we consider over 4 solar cycles of +daily historical magnetogram data from multiple instruments. This is the first +attempt to take advantage of this historical data for ML-based flare +forecasting. We apply a convolutional neural network (CNN) to extract features +from full-disk magnetograms together with a logistic regression model to +incorporate scalar features based on magnetograms and flaring history. We use +an ensemble approach to generate calibrated probabilistic forecasts of M-class +or larger flares in the next 24 hours. Overall, we find that including +historical data improves forecasting skill and reliability. We show that single +frame magnetograms do not contain significantly more relevant information than +can be summarized in a small number of scalar features, and that flaring +history has greater predictive power than our CNN-extracted features. This +indicates the importance of including temporal information in flare forecasting +models. + +
+
+ comment: 22 pages, 16 figures, accepted to ApJ +
+
+
+
+
+ + ☆ Robust Long-Tailed Learning via Label-Aware Bounded CVaR + + +
+ Data in real-world classification problems are often imbalanced or +long-tailed, wherein the majority classes contain most of the samples and +dominate the model training. In such a setting, a naive model tends to have +poor performance on the minority classes. Previously, a variety of loss +modifications have been proposed to address the long-tailed learning problem, +but these methods either treat the samples in the same class +indiscriminately or lack a theoretical guarantee. In this paper, we propose +two novel approaches based on CVaR (Conditional Value at Risk) to improve the +performance of long-tailed learning with a solid theoretical ground. +Specifically, we first introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss +to overcome the pessimistic result of the original CVaR, and further design the +optimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we +additionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to +stabilize the optimization process, for which we also offer theoretical +support. Extensive experiments on real-world datasets with long-tailed label +distributions verify the superiority of our proposed methods. + +
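CVaR at level alpha is simply the mean of the worst alpha-fraction of losses, which is the building block the label-aware bounds act on; a minimal sketch (the LAB-CVaR weight bounds themselves are omitted):

```python
# Empirical CVaR of per-sample losses: average of the top alpha-fraction.
import torch

def cvar_loss(per_sample_losses: torch.Tensor, alpha: float = 0.2):
    k = max(1, int(alpha * per_sample_losses.numel()))
    worst, _ = torch.topk(per_sample_losses, k)
    return worst.mean()

# loss = cvar_loss(F.cross_entropy(logits, y, reduction="none"))
```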
+
+
+
+
+ + ☆ The CausalBench challenge: A machine learning contest for gene network + inference from single-cell perturbation data + + +
+ In drug discovery, mapping interactions between genes within cellular systems +is a crucial early step. This helps formulate hypotheses regarding molecular +mechanisms that could potentially be targeted by future medicines. The +CausalBench Challenge was an initiative to invite the machine learning +community to advance the state of the art in constructing gene-gene interaction +networks. These networks, derived from large-scale, real-world datasets of +single cells under various perturbations, are crucial for understanding the +causal mechanisms underlying disease biology. Using the framework provided by +the CausalBench benchmark, participants were tasked with enhancing the capacity +of state-of-the-art methods to leverage large-scale genetic perturbation +data. This report provides an analysis and summary of the methods submitted +during the challenge to give a partial picture of the state of the art at the +time of the challenge. The winning solutions significantly improved performance +compared to previous baselines, establishing a new state of the art for this +critical task in biology and medicine. + +
+
+
+
+
+ + ☆ Decentralized Multi-agent Reinforcement Learning based State-of-Charge + Balancing Strategy for Distributed Energy Storage System + + +
+ This paper develops a Decentralized Multi-Agent Reinforcement Learning +(Dec-MARL) method to solve the SoC balancing problem in the distributed energy +storage system (DESS). First, the SoC balancing problem is formulated as a +finite Markov decision process with action constraints derived from demand +balance, which can be solved by Dec-MARL. Specifically, the first-order average +consensus algorithm is utilized to expand the observations of the DESS state in +a fully-decentralized way, and the initial actions (i.e., output power) are +decided by the agents (i.e., energy storage units) according to these +observations. In order to get the final actions in the allowable range, a +counterfactual demand balance algorithm is proposed to balance the total demand +and the initial actions. Next, the agents execute the final actions and get +local rewards from the environment, and the DESS steps into the next state. +Finally, through the first-order average consensus algorithm, the agents get +the average reward and the expanded observation of the next state for later +training. By the above procedure, Dec-MARL achieves outstanding performance in a +fully-decentralized system without any expert experience or constructing any +complicated model. Moreover, it is flexible and can be extended to other +decentralized multi-agent systems straightforwardly. Extensive simulations have +validated the effectiveness and efficiency of Dec-MARL. + +
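First-order average consensus, the primitive used above to expand each agent's observations, is a one-line iteration; a toy sketch on a four-unit ring (the adjacency matrix and step size are illustrative assumptions):

```python
# Each unit repeatedly mixes its value with its neighbors'; on a connected
# graph with a small enough step size, all values converge to the global mean.
import numpy as np

A = np.array([[0, 1, 0, 1],           # ring of 4 storage units
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 0, 1, 0]], dtype=float)
x = np.array([0.9, 0.4, 0.7, 0.2])    # local SoC observations
eps = 0.25                            # must be below 1 / max degree
for _ in range(50):
    x = x + eps * (A @ x - A.sum(axis=1) * x)   # x_i += eps * sum_j (x_j - x_i)
print(x)    # every entry approaches the mean SoC, 0.55
```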
+
+
+
+
+ + ☆ Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation + and Diagnosis + + +
+ Thyroid nodule segmentation is a crucial step in the diagnostic procedure of +physicians and computer-aided diagnosis systems. Most current studies treat +segmentation and diagnosis as independent tasks without considering the +correlation between these tasks. The sequential steps of these independent tasks +in computer-aided diagnosis systems may lead to the accumulation of errors. +Therefore, it is worth combining them as a whole by exploring the +relationship between thyroid nodule segmentation and diagnosis. According to +the thyroid imaging reporting and data system (TI-RADS), the assessment of +shape and margin characteristics is the prerequisite for the discrimination of +benign and malignant thyroid nodules. These characteristics can be observed in +the thyroid nodule segmentation masks. Inspired by the diagnostic procedure of +TI-RADS, this paper proposes a shape-margin knowledge augmented network +(SkaNet) for simultaneous thyroid nodule segmentation and diagnosis. Due to +the similarity in visual features between segmentation and diagnosis, SkaNet +shares visual features in the feature extraction stage and then utilizes a +dual-branch architecture to perform thyroid nodule segmentation and diagnosis +tasks simultaneously. To enhance effective discriminative features, an +exponential mixture module is devised, which incorporates convolutional feature +maps and self-attention maps by exponential weighting. Then, SkaNet is jointly +optimized by a knowledge augmented multi-task loss function with a constraint +penalty term. It embeds shape and margin characteristics through numerical +computation and models the relationship between the thyroid nodule diagnosis +results and segmentation masks. + +
+
+
+
+
+ + ☆ Multi-Response Heteroscedastic Gaussian Process Models and Their + Inference + + +
+ Despite the widespread utilization of Gaussian process models for versatile +nonparametric modeling, they exhibit limitations in effectively capturing +abrupt changes in function smoothness and accommodating relationships with +heteroscedastic errors. Addressing these shortcomings, the heteroscedastic +Gaussian process (HeGP) regression seeks to introduce flexibility by +acknowledging the variability of residual variances across covariates in the +regression model. In this work, we extend the HeGP concept, expanding its scope +beyond regression tasks to encompass classification and state-space models. To +achieve this, we propose a novel framework where the Gaussian process is +coupled with a covariate-induced precision matrix process, adopting a mixture +formulation. This approach enables the modeling of heteroscedastic covariance +functions across covariates. To mitigate the computational challenges posed by +sampling, we employ variational inference to approximate the posterior and +facilitate posterior predictive modeling. Additionally, our training process +leverages an EM algorithm featuring closed-form M-step updates to efficiently +evaluate the heteroscedastic covariance function. A notable feature of our +model is its consistent performance on multivariate responses, accommodating +various types (continuous or categorical) seamlessly. Through a combination of +simulations and real-world applications in climatology, we illustrate the +model's prowess and advantages. By overcoming the limitations of traditional +Gaussian process models, our proposed framework offers a robust and versatile +tool for a wide array of applications. + +
+
+ comment: submitted to the Journal of the American Statistical Association + (JASA) +
+
+
+
+
+ + ☆ Efficient Model Personalization in Federated Learning via + Client-Specific Prompt Generation ICCV 2023 + + +
+ Federated learning (FL) has emerged as a decentralized learning framework which +trains models from multiple distributed clients without sharing their data, to +preserve privacy. Recently, large-scale pre-trained models (e.g., Vision +Transformer) have shown a strong capability of deriving robust representations. +However, the data heterogeneity among clients, the limited computation +resources, and the communication bandwidth restrict the deployment of +large-scale models in FL frameworks. To leverage robust representations from +large-scale models while enabling efficient model personalization for +heterogeneous clients, we propose a novel personalized FL framework of +client-specific Prompt Generation (pFedPG), which learns to deploy a +personalized prompt generator at the server for producing client-specific +visual prompts that efficiently adapt frozen backbones to local data +distributions. Our proposed framework jointly optimizes the stages of +personalized prompt adaptation locally and personalized prompt generation +globally. The former aims to train visual prompts that adapt foundation models +to each client, while the latter observes local optimization directions to +generate personalized prompts for all clients. Through extensive experiments on +benchmark datasets, we show that our pFedPG is favorable against +state-of-the-art personalized FL methods under various types of data +heterogeneity, allowing computation- and communication-efficient model +personalization. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Heterogeneous Multi-Task Gaussian Cox Processes + + +
+ This paper presents a novel extension of multi-task Gaussian Cox processes +for modeling multiple heterogeneous correlated tasks jointly, e.g., +classification and regression, via multi-output Gaussian processes (MOGP). A +MOGP prior over the parameters of the dedicated likelihoods for classification, +regression and point process tasks can facilitate sharing of information +between heterogeneous tasks, while allowing for nonparametric parameter +estimation. To circumvent the non-conjugate Bayesian inference in the MOGP +modulated heterogeneous multi-task framework, we employ the data augmentation +technique and derive a mean-field approximation to realize closed-form +iterative updates for estimating model parameters. We demonstrate the +performance and inference on both 1D synthetic data as well as 2D urban data of +Vancouver. + +
+
+
+
+
+ + ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for the Text-to-SQL +task. However, the absence of a systematic benchmark inhibits the design +of effective, efficient and economic LLM-based Text-to-SQL solutions. +To address this challenge, in this paper, we first conduct a systematic and +extensive comparison of existing prompt engineering methods, including +question representation, example selection and example organization, and with +these experimental results we elaborate on their pros and cons. Based on these +findings, we propose a new integrated solution, named DAIL-SQL, which refreshes +the Spider leaderboard with 86.6% execution accuracy and sets a new bar. +Towards an efficient and economic LLM-based Text-to-SQL solution, we emphasize +token efficiency in prompt engineering and compare the prior studies under +this metric. Additionally, we investigate open-source LLMs in in-context +learning, and further enhance their performance with task-specific supervised +fine-tuning. Our explorations highlight open-source LLMs' potential in +Text-to-SQL, as well as the advantages and disadvantages of task-specific +supervised fine-tuning. We hope that our work provides a deeper understanding +of Text-to-SQL with LLMs, and inspires further investigations and broad +applications. + +
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
+
+
+
+
+ + ☆ Lie-Poisson Neural Networks (LPNets): Data-Based Computing of + Hamiltonian Systems with Symmetries + + +
+ An accurate data-based prediction of the long-term evolution of Hamiltonian +systems requires a network that preserves the appropriate structure under each +time step. Every Hamiltonian system contains two essential ingredients: the +Poisson bracket and the Hamiltonian. Hamiltonian systems with symmetries, whose +paradigm examples are the Lie-Poisson systems, have been shown to describe a +broad category of physical phenomena, from satellite motion to underwater +vehicles, fluids, geophysical applications, complex fluids, and plasma physics. +The Poisson bracket in these systems comes from the symmetries, while the +Hamiltonian comes from the underlying physics. We view the symmetry of the +system as primary, hence the Lie-Poisson bracket is known exactly, whereas the +Hamiltonian is regarded as coming from physics and is considered not known, or +known approximately. Using this approach, we develop a network based on +transformations that exactly preserve the Poisson bracket and the special +functions of the Lie-Poisson systems (Casimirs) to machine precision. We +present two flavors of such systems: one, where the parameters of +transformations are computed from data using a dense neural network (LPNets), +and another, where the composition of transformations is used as building +blocks (G-LPNets). We also show how to adapt these methods to a larger class of +Poisson brackets. We apply the resulting methods to several examples, such as +rigid body (satellite) motion, underwater vehicles, a particle in a magnetic +field, and others. The methods developed in this paper are important for the +construction of accurate data-based methods for simulating the long-term +dynamics of physical systems. + +
+
+ comment: 57 pages, 13 figures +
+
+
+
+
+ + ☆ Imperceptible Adversarial Attack on Deep Neural Networks from Image + Boundary + + +
+ Although Deep Neural Networks (DNNs), such as convolutional neural +networks (CNNs) and Vision Transformers (ViTs), have been successfully applied +in the field of computer vision, they have been shown to be vulnerable to +carefully crafted Adversarial Examples (AEs) that can easily fool them. +Research on AEs has been active, and many adversarial attacks and explanations +have been proposed since they were discovered in 2014. The reason for the AEs' +existence is still an open question, and many studies suggest that DNN training +algorithms have blind spots. Salient objects usually do not overlap with image +boundaries; hence, the boundaries attract little of a DNN model's attention. +Nevertheless, recent studies show that the boundaries can dominate the behavior +of DNN models. Hence, this study looks at AEs from a different +perspective and proposes an imperceptible adversarial attack that systematically +attacks the input image boundary to find AEs. The experimental results +show that the proposed boundary attacking method effectively attacks six +CNN models and the ViT using only 32% of the input image content (from the +boundaries), with an average success rate (SR) of 95.2% and an average peak +signal-to-noise ratio of 41.37 dB. Correlation analyses are conducted, +including the relation between the adversarial boundary's width and the SR, and +how the adversarial boundary changes the DNN model's attention. This paper's +discoveries can potentially advance the understanding of AEs and provide a +different perspective on how AEs can be constructed. + +
+
+
+
+
+ + ☆ Enhancing Robot Learning through Learned Human-Attention Feature Maps ICRA 2023 + + +
+ Robust and efficient learning remains a challenging problem in robotics, in +particular with complex visual inputs. Inspired by human attention mechanism, +with which we quickly process complex visual scenes and react to changes in the +environment, we think that embedding auxiliary information about focus point +into robot learning would enhance efficiency and robustness of the learning +process. In this paper, we propose a novel approach to model and emulate the +human attention with an approximate prediction model. We then leverage this +output and feed it as a structured auxiliary feature map into downstream +learning tasks. We validate this idea by learning a prediction model from +human-gaze recordings of manual driving in the real world. We test our approach +on two learning tasks - object detection and imitation learning. Our +experiments demonstrate that the inclusion of predicted human attention leads +to improved robustness of the trained models to out-of-distribution samples and +faster learning in low-data regime settings. Our work highlights the potential +of incorporating structured auxiliary information in representation learning +for robotics and opens up new avenues for research in this direction. All code +and data are available online. + +
+
+ comment: This work has been accepted for the RAP4Robots workshop at ICRA 2023 + in London +
+
+
+
+
+ + ☆ Occlusion-Aware Deep Convolutional Neural Network via Homogeneous + Tanh-transforms for Face Parsing + + +
+ Face parsing infers a pixel-wise label map for each semantic facial +component. Previous methods generally work well for uncovered faces, but they +overlook facial occlusion and ignore contextual areas outside a single +face, even though facial occlusion has become a common situation during the +COVID-19 epidemic. Inspired by the illumination theory of images, we propose +novel homogeneous tanh-transforms for image preprocessing, made up of +four tanh-transforms that fuse the central vision and the peripheral vision +together. Our proposed method addresses the dilemma of face parsing under +occlusion and compresses more information from the surrounding context. Based on +homogeneous tanh-transforms, we propose an occlusion-aware convolutional neural +network for occluded face parsing. It combines information in both +Tanh-polar space and Tanh-Cartesian space, and is capable of enhancing receptive +fields. Furthermore, we introduce an occlusion-aware loss to focus on the +boundaries of occluded regions. The network is simple and flexible, and can be +trained end-to-end. To facilitate future research on occluded face parsing, we +also contribute a new cleaned face parsing dataset, manually purified +from several academic and industrial datasets, including CelebAMask-HQ, +Short-video Face Parsing and the Helen dataset, and will make it public. +Experiments demonstrate that our method surpasses state-of-the-art methods for face +parsing under occlusion. + +
+
+
+
+
+ + ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then identify the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. + +
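As described, Epsilon Scaling amounts to dividing the predicted noise by a factor slightly above one at sampling time; a hedged sketch inside a DDIM-style update (the scaling value `lam` is a placeholder, not the paper's tuned schedule):

```python
# One deterministic DDIM-style step with Epsilon Scaling applied.
# alpha_bar_t / alpha_bar_prev are the cumulative noise-schedule products.
import torch

def ddim_step(eps_pred, x_t, alpha_bar_t, alpha_bar_prev, lam: float = 1.005):
    eps = eps_pred / lam                     # the only change vs. vanilla DDIM
    x0 = (x_t - (1 - alpha_bar_t).sqrt() * eps) / alpha_bar_t.sqrt()
    return alpha_bar_prev.sqrt() * x0 + (1 - alpha_bar_prev).sqrt() * eps
```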
+
+ comment: 7 pages, code available soon +
+
+
+
+
+ + ☆ On-Device Learning with Binary Neural Networks + + +
+ Existing Continual Learning (CL) solutions only partially address the +constraints on power, memory and computation of deep learning models when +deployed on low-power embedded CPUs. In this paper, we propose a CL solution +that embraces recent advancements in the CL field and the efficiency of +Binary Neural Networks (BNNs), which use 1-bit weights and activations to +efficiently execute deep learning models. We propose a hybrid quantization of +CWR* (an effective CL approach) that treats the forward and backward passes +differently in order to retain more precision during the gradient update step +while minimizing the latency overhead. The choice of a binary network as the +backbone is essential to meet the constraints of low-power devices and, to the +best of the authors' knowledge, this is the first attempt to demonstrate on-device +learning with BNNs. The experimental validation carried out confirms the +validity and suitability of the proposed method. + +
+
+
+
+
+ + ☆ Towards quantitative precision for ECG analysis: Leveraging state space + models, self-supervision and patient metadata + + +
+ Deep learning has emerged as the preferred modeling approach for automatic +ECG analysis. In this study, we investigate three elements aimed at improving +the quantitative accuracy of such systems. These components consistently +enhance performance beyond the existing state-of-the-art, which is +predominantly based on convolutional models. Firstly, we explore more +expressive architectures by exploiting structured state space models (SSMs). +These models have shown promise in capturing long-term dependencies in time +series data. By incorporating SSMs into our approach, we not only achieve +better performance, but also gain insights into long-standing questions in the +field. Specifically, for standard diagnostic tasks, we find no advantage in +using higher sampling rates such as 500Hz compared to 100Hz. Similarly, +extending the input size of the model beyond 3 seconds does not lead to +significant improvements. Secondly, we demonstrate that self-supervised +learning using contrastive predictive coding can further improve the +performance of SSMs. By leveraging self-supervision, we enable the model to +learn more robust and representative features, leading to improved analysis +accuracy. Lastly, we depart from synthetic benchmarking scenarios and +incorporate basic demographic metadata alongside the ECG signal as input. This +inclusion of patient metadata departs from the conventional practice of relying +solely on the signal itself. Remarkably, this addition consistently yields +positive effects on predictive performance. We firmly believe that all three +components should be considered when developing next-generation ECG analysis +algorithms. + +
+
+ comment: extended version of arXiv:2211.07579 +
+
+
+
+
+ + ☆ Structural Node Embeddings with Homomorphism Counts + + +
+ Graph homomorphism counts, first explored by Lov\'asz in 1967, have recently +garnered interest as a powerful tool in graph-based machine learning. Grohe +(PODS 2020) proposed the theoretical foundations for using homomorphism counts +in machine learning on graph-level as well as node-level tasks. By their very +nature, these capture local structural information, which enables the creation +of robust structural embeddings. While a first approach for graph-level tasks +has been made by Nguyen and Maehara (ICML 2020), we experimentally show the +effectiveness of homomorphism-count-based node embeddings. Enriched with node +labels, node weights, and edge weights, these offer an interpretable +representation of graph data, allowing for enhanced explainability of machine +learning models. + We propose a theoretical framework for isomorphism-invariant homomorphism +count based embeddings which lend themselves to a wide variety of downstream +tasks. Our approach capitalises on the efficient computability of graph +homomorphism counts for bounded treewidth graph classes, rendering it a +practical solution for real-world applications. We demonstrate their +expressivity through experiments on benchmark datasets. Although our results do +not match the accuracy of state-of-the-art neural architectures, they are +comparable to other advanced graph learning models. Remarkably, our approach +distinguishes itself by ensuring explainability for each individual feature. By +integrating interpretable machine learning algorithms like SVMs or Random +Forests, we establish a seamless, end-to-end explainable pipeline. Our study +contributes to the advancement of graph-based techniques that offer both +performance and interpretability. + +
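As a concrete miniature: homomorphism counts of rooted paths are exactly walk counts, so a tiny structural node embedding can be read off powers of the adjacency matrix (a toy illustration of the flavor of these features, not the paper's pipeline):

```python
# Node i's feature vector = (# walks of length 1, 2, 3 starting at i),
# i.e. rooted path homomorphism counts into the graph.
import numpy as np

A = np.array([[0, 1, 1],
              [1, 0, 0],
              [1, 0, 0]], dtype=float)      # a 3-node star
ones = np.ones(3)
embedding = np.stack(
    [np.linalg.matrix_power(A, k) @ ones for k in (1, 2, 3)], axis=1)
print(embedding)
```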
+
+
+
+
+ + ☆ Let There Be Sound: Reconstructing High Quality Speech from Silent + Videos + + +
+ The goal of this work is to reconstruct high quality speech from lip motions +alone, a task also known as lip-to-speech. A key challenge of lip-to-speech +systems is the one-to-many mapping caused by (1) the existence of homophenes +and (2) multiple speech variations, resulting in a mispronounced and +over-smoothed speech. In this paper, we propose a novel lip-to-speech system +that significantly improves the generation quality by alleviating the +one-to-many mapping problem from multiple perspectives. Specifically, we +incorporate (1) self-supervised speech representations to disambiguate +homophenes, and (2) acoustic variance information to model diverse speech +styles. Additionally, to better solve the aforementioned problem, we employ a +flow based post-net which captures and refines the details of the generated +speech. We perform extensive experiments and demonstrate that our method +achieves the generation quality close to that of real human utterance, +outperforming existing methods in terms of speech naturalness and +intelligibility by a large margin. Synthesised samples are available at the +anonymous demo page: https://mm.kaist.ac.kr/projects/LTBS. + +
+
+ comment: 10 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ The Relative Gaussian Mechanism and its Application to Private Gradient + Descent + + +
+ The Gaussian Mechanism (GM), which consists in adding Gaussian noise to a +vector-valued query before releasing it, is a standard privacy protection +mechanism. In particular, given that the query respects some L2 sensitivity +property (the L2 distance between outputs on any two neighboring inputs is +bounded), GM guarantees R\'enyi Differential Privacy (RDP). Unfortunately, +precisely bounding the L2 sensitivity can be hard, thus leading to loose +privacy bounds. In this work, we consider a Relative L2 sensitivity assumption, +in which the bound on the distance between two query outputs may also depend on +their norm. Leveraging this assumption, we introduce the Relative Gaussian +Mechanism (RGM), in which the variance of the noise depends on the norm of the +output. We prove tight bounds on the RDP parameters under relative L2 +sensitivity, and characterize the privacy loss incurred by using +output-dependent noise. In particular, we show that RGM naturally adapts to a +latent variable that would control the norm of the output. Finally, we +instantiate our framework to show tight guarantees for Private Gradient +Descent, a problem that naturally fits our relative L2 sensitivity assumption. + +
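The mechanism's defining change is that the noise scale tracks the output norm; a minimal sketch (the constant `c` and the exact calibration to RDP parameters are assumptions omitted here):

```python
# Output-dependent Gaussian noise in the spirit of RGM: the standard
# deviation is proportional to the norm of the released vector.
import numpy as np

def relative_gaussian(output: np.ndarray, c: float = 0.1,
                      rng=np.random.default_rng(0)) -> np.ndarray:
    sigma = c * np.linalg.norm(output)     # relative, not absolute, scale
    return output + rng.normal(scale=sigma, size=output.shape)

noisy = relative_gaussian(np.array([3.0, 4.0]))   # sigma = 0.1 * 5.0
```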
+
+
+
+
+ + ☆ Reliability Gaps Between Groups in COMPAS Dataset + + +
+ This paper investigates the inter-rater reliability of risk assessment +instruments (RAIs). The main question is whether different, socially salient +groups are affected differently by a lack of inter-rater reliability of RAIs, +that is, whether mistakes with respect to different groups affect them +differently. The question is investigated with a simulation study of the COMPAS +dataset. A controlled degree of noise is injected into the input data of a +predictive model; the noise can be interpreted as a synthetic rater that makes +mistakes. The main finding is that there are systematic differences in output +reliability between groups in the COMPAS dataset. The sign of the difference +depends on the kind of inter-rater statistic that is used (Cohen's Kappa, +Byrt's PABAK, ICC), and in particular whether or not a correction for the +prediction prevalences of the groups is used. + +
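The simulation design can be skeletonized as follows: emulate a noisy synthetic rater by flipping a controlled fraction of predictions, then compare an agreement statistic per group (stand-in data; note the study perturbs model inputs rather than flipping outputs directly as done here for brevity):

```python
# Per-group agreement between clean and noise-perturbed predictions.
import numpy as np
from sklearn.metrics import cohen_kappa_score

rng = np.random.default_rng(0)
y_pred_clean = rng.integers(0, 2, size=1000)        # stand-in predictions
noise = rng.random(1000) < 0.1                      # 10% injected mistakes
y_pred_noisy = np.where(noise, 1 - y_pred_clean, y_pred_clean)
group = rng.integers(0, 2, size=1000)               # synthetic group labels

for g in (0, 1):
    m = group == g
    print(g, cohen_kappa_score(y_pred_clean[m], y_pred_noisy[m]))
```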
+
+ comment: 15 pages + appendix +
+
+
+
+
+ + ☆ Assessing Cyclostationary Malware Detection via Feature Selection and + Classification + + +
+ Cyclostationarity involves periodic statistical variations in signals and +processes, commonly used in signal analysis and network security. In the +context of attacks, cyclostationarity helps detect malicious behaviors within +network traffic, such as traffic patterns in Distributed Denial of Service +(DDoS) attacks or hidden communication channels in malware. This approach +enhances security by identifying abnormal patterns and informing Network +Intrusion Detection Systems (NIDSs) to recognize potential attacks, enhancing +protection against both known and novel threats. This research focuses on +identifying cyclostationary malware behavior and its detection. The main goal +is to pinpoint essential cyclostationary features used in NIDSs. These features +are extracted using algorithms such as Boruta and Principal Component Analysis +(PCA), and then categorized to find the most significant cyclostationary +patterns. The aim of this article is to reveal periodically changing malware +behaviors through cyclostationarity. The study highlights the importance of +spotting cyclostationary malware in NIDSs by using established datasets like +KDD99, NSL-KDD, and the UGRansome dataset. The UGRansome dataset is designed +for anomaly detection research and includes both normal and abnormal network +threat categories of zero-day attacks. A comparison is made using the Random +Forest (RF) and Support Vector Machine (SVM) algorithms, while also evaluating +the effectiveness of Boruta and PCA. The findings show that PCA is more +promising than using Boruta alone for extracting cyclostationary network +feature patterns. Additionally, the analysis identifies the internet protocol +as the most noticeable cyclostationary feature pattern used by malware. +Notably, the UGRansome dataset outperforms the KDD99 and NSL-KDD, achieving 99% +accuracy in signature malware detection using the RF algorithm and 98% with the +SVM. + +
+
+
+
+
+ + ☆ Classification-Aware Neural Topic Model Combined With Interpretable + Analysis -- For Conflict Classification + + +
+ A large number of conflict events affect the world at any given time. In +order to analyse such conflict events effectively, this paper presents a +Classification-Aware Neural Topic Model (CANTM-IA) for Conflict Information +Classification and Topic Discovery. The model provides a reliable +interpretation of classification results and discovered topics by introducing +interpretability analysis. At the same time, interpretation is introduced into +the model architecture to improve the classification performance of the model +and to allow interpretation to focus further on the details of the data. +Finally, the model architecture is optimised to reduce the complexity of the +model. + +
+
+ comment: Accepted by RANLP 2023 +
+
+
+
+
+ + ☆ Providing Previously Unseen Users Fair Recommendations Using Variational + Autoencoders RecSys 2023 + + +
+ An emerging definition of fairness in machine learning requires that models +are oblivious to demographic user information, e.g., a user's gender or age +should not influence the model. Personalized recommender systems are +particularly prone to violating this definition through their explicit user +focus and user modelling. Explicit user modelling is also an aspect that makes +many recommender systems incapable of providing hitherto unseen users with +recommendations. We propose novel approaches for mitigating discrimination in +Variational Autoencoder-based recommender systems by limiting the encoding of +demographic information. The approaches are capable of, and evaluated on, +providing users that are not represented in the training data with fair +recommendations. + +
+
+ comment: Appearing in RecSys 2023 proceedings +
+
+
+
+
+ + ☆ Evaluating Explanation Methods for Multivariate Time Series + Classification ALT + + +
+ Multivariate time series classification is an important computational task
+arising in applications where data is recorded over time and over multiple
+channels. For example, a smartwatch can record the acceleration and
+orientation of a person's motion, and these signals are recorded as
+multivariate time series. We can classify this data to understand and predict
+human movement and various properties such as fitness levels. In many
+applications classification alone is not enough; we often need to not only
+classify but also understand what the model learns (e.g., why a prediction
+was given, based on what information in the data). The main focus of this
+paper is on analysing and evaluating explanation methods tailored to
+Multivariate Time Series Classification (MTSC). We focus on saliency-based
+explanation methods that can point out the most relevant channels and time
+series points for the classification decision. We analyse two popular and
+accurate multivariate time series classifiers, ROCKET and dResNet, as well as
+two popular explanation methods, SHAP and dCAM. We study these methods on 3
+synthetic datasets and 2 real-world datasets and provide a quantitative and
+qualitative analysis of the explanations provided. We find that flattening
+the multivariate datasets by concatenating the channels works as well as
+using multivariate classifiers directly and adaptations of SHAP for MTSC work
+quite well. Additionally, we also find that the popular synthetic datasets we
+used are not suitable for time series analysis.
+
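+
+A minimal sketch of the channel-concatenation baseline mentioned above
+(synthetic stand-in data; a ridge classifier stands in for a ROCKET-style
+pipeline):
+
+```python
+import numpy as np
+from sklearn.linear_model import RidgeClassifier
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(100, 3, 150))       # (samples, channels, time points)
+y = (X[:, 0].mean(axis=1) > 0).astype(int)   # stand-in labels
+
+X_flat = X.reshape(len(X), -1)           # concatenate channels -> (100, 450)
+clf = RidgeClassifier().fit(X_flat[:80], y[:80])
+print("accuracy:", clf.score(X_flat[80:], y[80:]))
+```
+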
+
+ comment: Accepted at AALTD '23 +
+
+
+
+
+ + ☆ Ensemble of Counterfactual Explainers + + +
+ In eXplainable Artificial Intelligence (XAI), several counterfactual +explainers have been proposed, each focusing on some desirable properties of +counterfactual instances: minimality, actionability, stability, diversity, +plausibility, discriminative power. We propose an ensemble of counterfactual +explainers that boosts weak explainers, which provide only a subset of such +properties, to a powerful method covering all of them. The ensemble runs weak +explainers on a sample of instances and of features, and it combines their +results by exploiting a diversity-driven selection function. The method is +model-agnostic and, through a wrapping approach based on autoencoders, it is +also data-agnostic. + +
+
+
+
+
+ + ☆ Is visual explanation with Grad-CAM more reliable for deeper neural + networks? a case study with automatic pneumothorax diagnosis + + +
+ While deep learning techniques have provided the state-of-the-art
+performance in various clinical tasks, explainability regarding their
+decision-making process can greatly enhance the credibility of these methods
+for safer and quicker clinical adoption. With high flexibility,
+Gradient-weighted Class Activation Mapping (Grad-CAM) has been widely adopted
+to offer intuitive visual interpretation of various deep learning models'
+reasoning processes in computer-assisted diagnosis. However, despite the
+popularity of the technique, there is still a lack of systematic study on
+Grad-CAM's performance on different deep learning architectures. In this
+study, we investigate its robustness and effectiveness across different
+popular deep learning models, with a focus on the impact of the networks'
+depths and architecture types, by using a case study of automatic
+pneumothorax diagnosis in X-ray scans. Our results show that deeper neural
+networks do not necessarily contribute to a strong improvement of
+pneumothorax diagnosis accuracy, and the effectiveness of Grad-CAM also
+varies among different network architectures.
+
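+
+A minimal sketch of the Grad-CAM mechanism the study evaluates (standard
+Grad-CAM, not the study's code; the untrained ResNet-18 and the random tensor
+stand in for a trained model and an X-ray):
+
+```python
+import torch
+import torch.nn.functional as F
+from torchvision.models import resnet18
+
+model = resnet18(weights=None).eval()        # untrained stand-in; any CNN works
+acts = {}
+
+def save_act(module, inp, out):
+    out.retain_grad()                        # keep gradient of the feature maps
+    acts["a"] = out
+
+model.layer4.register_forward_hook(save_act)
+
+x = torch.randn(1, 3, 224, 224)              # stand-in for a chest X-ray
+model(x)[0].max().backward()                 # backprop the top-class logit
+
+A, dA = acts["a"], acts["a"].grad            # feature maps and their gradients
+w = dA.mean(dim=(2, 3), keepdim=True)        # GAP of gradients = channel weights
+cam = F.relu((w * A).sum(dim=1, keepdim=True))
+cam = F.interpolate(cam, size=x.shape[2:], mode="bilinear", align_corners=False)
+print(cam.squeeze().shape)                   # (224, 224) saliency map
+```
+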
+
+
+
+
+ + ☆ ABS-SGD: A Delayed Synchronous Stochastic Gradient Descent Algorithm + with Adaptive Batch Size for Heterogeneous GPU Clusters + + +
+ As the size of models and datasets grows, it has become increasingly common +to train models in parallel. However, existing distributed stochastic gradient +descent (SGD) algorithms suffer from insufficient utilization of computational +resources and poor convergence in heterogeneous clusters. In this paper, we +propose a delayed synchronous SGD algorithm with adaptive batch size (ABS-SGD) +for heterogeneous GPU clusters. In ABS-SGD, workers perform global +synchronization to accumulate delayed gradients and use the accumulated delayed +gradients to update parameters. While workers are performing global +synchronization for delayed gradients, they perform the computation of the next +batch without specifying batch size in advance, which lasts until the next +global synchronization starts, realizing the full utilization of computational +resources. Since the gradient delay is only one iteration, the stale gradient +problem can be alleviated. We theoretically prove the convergence of ABS-SGD in +heterogeneous clusters. Extensive experiments in three types of heterogeneous +clusters demonstrate that ABS-SGD can make full use of computational resources +and accelerate model convergence: When training ResNet18 network with 4 +workers, ABS-SGD increases the convergence speed by 1.30x on average compared +with the best baseline algorithm. + +
+
+ comment: 15 pages, 3 figures +
+
+
+
+
+ + ☆ On the improvement of model-predictive controllers + + +
+ This article investigates synthetic model-predictive control (MPC) problems
+to demonstrate that an increased precision of the internal prediction model
+(PM) automatically entails an improvement of the controller as a whole. In
+contrast to reinforcement learning (RL), MPC uses the PM to predict
+subsequent states of the controlled system (CS), instead of directly
+recommending suitable actions. To assess how the precision of the PM
+translates into the quality of the model-predictive controller, we compare a
+DNN-based PM to the optimal baseline PM for three well-known control problems
+of varying complexity. The baseline PM achieves perfect accuracy by accessing
+the simulation of the CS itself. Based on the obtained results, we argue that
+an improvement of the PM will always improve the controller as a whole,
+irrespective of other components such as action selection (which, in this
+article, relies on evolutionary optimization).
+
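+
+A minimal sketch of the MPC loop described above, with a toy 1-D prediction
+model; random-shooting action search stands in here for the article's
+evolutionary optimization:
+
+```python
+import numpy as np
+
+def pm_step(state, action):
+    """Stand-in prediction model (PM): toy 1-D dynamics and reward."""
+    nxt = state + 0.1 * action
+    return nxt, -abs(nxt)                    # reward: stay near the origin
+
+def mpc_action(state, horizon=10, n_candidates=256, seed=0):
+    """Score random action sequences with the PM; return the best first action."""
+    rng = np.random.default_rng(seed)
+    best_a, best_ret = 0.0, -np.inf
+    for _ in range(n_candidates):
+        seq = rng.uniform(-1.0, 1.0, horizon)
+        s, ret = state, 0.0
+        for a in seq:                        # roll the PM forward
+            s, r = pm_step(s, a)
+            ret += r
+        if ret > best_ret:
+            best_a, best_ret = seq[0], ret
+    return best_a
+
+print(mpc_action(1.0))                       # should push the state toward 0
+```
+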
+
+
+
+
+ + ☆ Uncertainty Aware Training to Improve Deep Learning Model Calibration + for Classification of Cardiac MR Images + + +
+ Quantifying uncertainty of predictions has been identified as one way to
+develop more trustworthy artificial intelligence (AI) models beyond
+conventional reporting of performance metrics. When considering their role in
+a clinical decision support setting, AI classification models should ideally
+avoid confident wrong predictions and maximise the confidence of correct
+predictions. Models that do this are said to be well-calibrated with regard
+to confidence. However, relatively little attention has been paid to how to
+improve calibration when training these models, i.e., to make the training
+strategy uncertainty-aware. In this work, we evaluate three novel
+uncertainty-aware training strategies, comparing them against two
+state-of-the-art approaches. We analyse performance on two different clinical
+applications: cardiac resynchronisation therapy (CRT) response prediction and
+coronary artery disease (CAD) diagnosis from cardiac magnetic resonance (CMR)
+images. The best-performing model in terms of both classification accuracy
+and the most common calibration measure, expected calibration error (ECE),
+was the Confidence Weight method, a novel approach that weights the loss of
+samples to explicitly penalise confident incorrect predictions. The method
+reduced the ECE by 17% for CRT response prediction and by 22% for CAD
+diagnosis when compared to a baseline classifier in which no
+uncertainty-aware strategy was included. In both applications, as well as
+reducing the ECE, there was a slight increase in accuracy, from 69% to 70%
+and from 70% to 72% for CRT response prediction and CAD diagnosis
+respectively. However, our analysis showed a lack of consistency in terms of
+optimal models when using different calibration measures. This indicates the
+need for careful consideration of performance metrics when training and
+selecting models for complex high-risk applications in healthcare.
+
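+
+A minimal sketch of the expected calibration error (ECE) measure used above
+(standard binned ECE; the confidences below are synthetic):
+
+```python
+import numpy as np
+
+def expected_calibration_error(conf, correct, n_bins=10):
+    """ECE: bin-weighted gap between accuracy and mean confidence per bin."""
+    bins = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(bins[:-1], bins[1:]):
+        mask = (conf > lo) & (conf <= hi)
+        if mask.any():
+            ece += mask.mean() * abs(correct[mask].mean() - conf[mask].mean())
+    return ece
+
+rng = np.random.default_rng(0)
+conf = rng.uniform(0.5, 1.0, 1000)              # predicted confidences
+correct = (rng.random(1000) < conf * 0.9)       # an overconfident model
+print(expected_calibration_error(conf, correct.astype(float)))
+```
+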
+
+
+
+
+ + ☆ Biquality Learning: a Framework to Design Algorithms Dealing with + Closed-Set Distribution Shifts + + +
+ Training machine learning models from data with weak supervision and
+dataset shifts is still challenging. Designing algorithms when these two
+situations arise has not been explored much, and existing algorithms cannot
+always handle the most complex distributional shifts. We think the biquality
+data setup is a suitable framework for designing such algorithms. Biquality
+Learning assumes that two datasets are available at training time: a trusted
+dataset sampled from the distribution of interest and an untrusted dataset
+affected by dataset shifts and weak supervision (together, distribution
+shifts). The trusted and untrusted datasets available at training time make
+designing algorithms dealing with any distribution shifts possible. We
+propose two methods, one inspired by the label noise literature and another
+by the covariate shift literature, for biquality learning. We experiment with
+two novel methods that synthetically introduce concept drift and
+class-conditional shifts into a wide range of real-world datasets. Finally,
+we discuss our findings and conclude that developing biquality learning
+algorithms robust to distributional changes remains an interesting problem
+for future research.
+
+
+
+
+
+ + ☆ Evaluation and Analysis of Hallucination in Large Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) have recently achieved remarkable
+success. However, LVLMs are still plagued by the hallucination problem, which
+limits their practicality in many scenarios. Hallucination refers to
+information in LVLMs' responses that does not exist in the visual input,
+which poses potential risks with substantial consequences. There has been
+limited work studying hallucination evaluation in LVLMs. In this paper, we
+propose Hallucination Evaluation based on Large Language Models (HaELM), an
+LLM-based hallucination evaluation framework. HaELM achieves approximately
+95% of ChatGPT's performance and has additional advantages including low
+cost, reproducibility, privacy preservation and local deployment. Leveraging
+HaELM, we evaluate hallucination in current LVLMs. Furthermore, we analyze
+the factors contributing to hallucination in LVLMs and offer helpful
+suggestions to mitigate the hallucination problem. Our training data and
+human annotation hallucination data will be made public soon.
+
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Mixup-Augmented Meta-Learning for Sample-Efficient Fine-Tuning of + Protein Simulators + + +
+ Molecular dynamics simulations have emerged as a fundamental instrument for
+studying biomolecules. At the same time, it is desirable to perform
+simulations of a collection of particles under various conditions in which
+the molecules can fluctuate. In this paper, we explore and adapt the soft
+prompt-based learning method to molecular dynamics tasks. Our model can
+remarkably generalize to unseen and out-of-distribution scenarios with
+limited training data. While our work focuses on temperature as a test case,
+the versatility of our approach allows for efficient simulation under any
+continuous dynamic condition, such as pressure or volume. Our framework has
+two stages: 1) a pre-training stage that uses a data-mixing technique to
+augment molecular structure data and temperature prompts, applying a
+curriculum learning method that smoothly increases their mixing ratio; 2) a
+meta-learning-based fine-tuning framework that improves the sample efficiency
+of fine-tuning and provides better initialization points for soft prompt
+tuning. Comprehensive experiments reveal that our framework excels in
+accuracy for in-domain data and demonstrates strong generalization
+capabilities for unseen and out-of-distribution samples.
+
+
+
+
+
+ + ☆ Stochastic Graph Bandit Learning with Side-Observations + + +
+ In this paper, we investigate the stochastic contextual bandit with general +function space and graph feedback. We propose an algorithm that addresses this +problem by adapting to both the underlying graph structures and reward gaps. To +the best of our knowledge, our algorithm is the first to provide a +gap-dependent upper bound in this stochastic setting, bridging the research gap +left by the work in [35]. In comparison to [31,33,35], our method offers +improved regret upper bounds and does not require knowledge of graphical +quantities. We conduct numerical experiments to demonstrate the computational +efficiency and effectiveness of our approach in terms of regret upper bounds. +These findings highlight the significance of our algorithm in advancing the +field of stochastic contextual bandits with graph feedback, opening up avenues +for practical applications in various domains. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2010.03104 by other authors +
+
+
+
+
+ + ☆ How Faithful are Self-Explainable GNNs? + + +
+ Self-explainable deep neural networks are a recent class of models that can +output ante-hoc local explanations that are faithful to the model's reasoning, +and as such represent a step forward toward filling the gap between +expressiveness and interpretability. Self-explainable graph neural networks +(GNNs) aim at achieving the same in the context of graph data. This begs the +question: do these models fulfill their implicit guarantees in terms of +faithfulness? In this extended abstract, we analyze the faithfulness of several +self-explainable GNNs using different measures of faithfulness, identify +several limitations -- both in the models themselves and in the evaluation +metrics -- and outline possible ways forward. + +
+
+
+
+
+ + ☆ Group-Conditional Conformal Prediction via Quantile Regression + Calibration for Crop and Weed Classification + + +
+ As deep learning predictive models become an integral part of a large
+spectrum of precision agricultural systems, a barrier to the adoption of such
+automated solutions is the lack of user trust in these highly complex, opaque
+and uncertain models. Indeed, deep neural networks are not equipped with any
+explicit guarantees that can be used to certify the system's performance,
+especially in highly varying uncontrolled environments such as the ones
+typically faced in computer vision for agriculture. Fortunately, certain
+methods developed in other communities can prove to be important for
+agricultural applications. This article presents the conformal prediction
+framework that provides valid statistical guarantees on the predictive
+performance of any black box prediction machine, with almost no assumptions,
+applied to the problem of deep visual classification of weeds and crops in
+real-world conditions. The framework is presented with a focus on its
+practical aspects, with special attention given to the Adaptive Prediction
+Sets (APS) approach that delivers marginal guarantees on the model's
+coverage. Marginal results are then shown to be insufficient to guarantee
+performance on all groups of individuals in the population as characterized
+by their environmental and pedo-climatic auxiliary data gathered during image
+acquisition. To tackle this shortcoming, group-conditional conformal
+approaches are presented: the ''classical'' method that consists of
+iteratively applying the APS procedure on all groups, and a proposed
+reformulation and implementation of the procedure using quantile regression
+on group membership indicators. Empirical results showing the validity of the
+proposed approach are presented, compared to the marginal APS, and discussed.
+
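+
+A minimal sketch of split-conformal calibration with (unrandomized) APS
+scores, as described above; the softmax outputs below are synthetic
+stand-ins:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+K, n_cal, alpha = 5, 500, 0.1
+probs = rng.dirichlet(np.ones(K), size=n_cal)   # calibration softmax outputs
+labels = np.array([rng.choice(K, p=p) for p in probs])
+
+# APS conformity score: probability mass of classes ranked at or above the label
+order = np.argsort(-probs, axis=1)
+ranks = np.argsort(order, axis=1)
+sorted_cum = np.cumsum(np.take_along_axis(probs, order, axis=1), axis=1)
+cum = np.take_along_axis(sorted_cum, ranks, axis=1)
+scores = cum[np.arange(n_cal), labels]
+
+# Finite-sample-corrected quantile for 1 - alpha coverage
+qhat = np.quantile(scores, np.ceil((n_cal + 1) * (1 - alpha)) / n_cal)
+
+def prediction_set(p):
+    o = np.argsort(-p)
+    cutoff = int(np.searchsorted(np.cumsum(p[o]), qhat)) + 1
+    return o[:cutoff]                           # classes kept until mass >= qhat
+
+print(prediction_set(rng.dirichlet(np.ones(K))))
+```
+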
+
+
+
+
+ + ☆ Can We Rely on AI? + + +
+ Over the last decade, adversarial attack algorithms have revealed
+instabilities in deep learning tools. These algorithms raise issues regarding
+safety, reliability and interpretability in artificial intelligence,
+especially in high-risk settings. From a practical perspective, there has
+been a war of escalation between those developing attack and defence
+strategies. At a more theoretical level, researchers have also studied bigger
+picture questions concerning the existence and computability of attacks. Here
+we give a brief overview of the topic, focusing on aspects that are likely to
+be of interest to researchers in applied and computational mathematics.
+
+
+
+
+
+ + ☆ Using deep learning for an automatic detection and classification of the + vascular bifurcations along the Circle of Willis + + +
+ Most intracranial aneurysms (ICAs) occur on a specific portion of the
+cerebral vascular tree named the Circle of Willis (CoW). More particularly,
+they mainly arise at fifteen of the major arterial bifurcations constituting
+this circular structure. Hence, for efficient and timely diagnosis, it is
+critical to develop methods able to accurately recognize each Bifurcation of
+Interest (BoI). Indeed, an automatic extraction of the bifurcations
+presenting the higher risk of developing an ICA would offer the
+neuroradiologists a quick glance at the most alarming areas. Owing to recent
+advances in Artificial Intelligence, Deep Learning has turned out to be the
+best performing technology for many pattern recognition tasks. Moreover,
+various methods have been particularly designed for medical image analysis
+purposes. This study intends to assist neuroradiologists in promptly locating
+any bifurcation presenting a high risk of ICA occurrence. It can be seen as a
+Computer Aided Diagnosis scheme, where Artificial Intelligence facilitates
+the access to the regions of interest within the MRI. In this work, we
+propose a method for a fully automatic detection and recognition of the
+bifurcations of interest forming the Circle of Willis. Several neural network
+architectures have been tested, and we thoroughly evaluate the bifurcation
+recognition rate.
+
+
+
+
+
+ + ☆ Exploring Model Transferability through the Lens of Potential Energy ICCV 2023 + + +
+ Transfer learning has become crucial in computer vision tasks due to the
+vast availability of pre-trained deep learning models. However, selecting the
+optimal pre-trained model from a diverse pool for a specific downstream task
+remains a challenge. Existing methods for measuring the transferability of
+pre-trained models rely on statistical correlations between encoded static
+features and task labels, but they overlook the impact of underlying
+representation dynamics during fine-tuning, leading to unreliable results,
+especially for self-supervised models. In this paper, we present an
+insightful physics-inspired approach named PED to address these challenges.
+We reframe the challenge of model selection through the lens of potential
+energy and directly model the interaction forces that influence fine-tuning
+dynamics. By capturing the motion of dynamic representations to reduce the
+potential energy within a force-driven physical model, we can acquire an
+enhanced and more stable observation for estimating transferability. The
+experimental results on 10 downstream tasks and 12 self-supervised models
+demonstrate that our approach can seamlessly integrate into existing ranking
+techniques and enhance their performances, revealing its effectiveness for
+the model selection task and its potential for understanding the mechanism in
+transfer learning. Code will be available at
+https://github.com/lixiaotong97/PED.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Advancing Adversarial Robustness Through Adversarial Logit Update + + +
+ Deep Neural Networks are susceptible to adversarial perturbations.
+Adversarial training and adversarial purification are among the most widely
+recognized defense strategies. Although these methods have different
+underlying logic, both rely on absolute logit values to generate label
+predictions. In this study, we analyze the logit difference around successful
+adversarial attacks from a theoretical point of view and propose a new
+principle, namely Adversarial Logit Update (ALU), to infer adversarial
+samples' labels. Based on ALU, we introduce a new classification paradigm
+that utilizes pre- and post-purification logit differences to boost a model's
+adversarial robustness. Without requiring adversarial or additional data for
+model training, our clean data synthesis model can be easily applied to
+various pre-trained models for both adversarial sample detection and
+ALU-based data classification. Extensive experiments on CIFAR-10, CIFAR-100,
+and tiny-ImageNet datasets show that even with simple components, the
+proposed solution achieves superior robustness performance compared to
+state-of-the-art methods against a wide range of adversarial attacks. Our
+Python implementation is included in the supplementary material and will be
+published upon the paper's acceptance.
+
+
+
+
+
+ + ☆ MadSGM: Multivariate Anomaly Detection with Score-based Generative + Models + + +
+ Time-series anomaly detection is one of the most fundamental time-series
+tasks. Unlike time-series forecasting and classification, time-series anomaly
+detection typically requires unsupervised (or self-supervised) training,
+since collecting and labeling anomalous observations is difficult. In
+addition, most existing methods resort to limited forms of anomaly
+measurements; therefore, it is not clear whether they are optimal in all
+circumstances. To this end, we present a multivariate time-series anomaly
+detector based on score-based generative models, called MadSGM, which
+considers the broadest ever set of anomaly measurement factors: i)
+reconstruction-based, ii) density-based, and iii) gradient-based anomaly
+measurements. We also design a conditional score network and its denoising
+score matching loss for time-series anomaly detection. Experiments on five
+real-world benchmark datasets illustrate that MadSGM achieves the most robust
+and accurate predictions.
+
+
+
+
+
+ + ☆ OEBench: Investigating Open Environment Challenges in Real-World + Relational Data Streams + + +
+ Relational datasets are widespread in real-world scenarios and are usually
+delivered in a streaming fashion. This type of data stream can present unique
+challenges, such as distribution drifts, outliers, emerging classes, and
+changing features, which have recently been described as open environment
+challenges for machine learning. While some work has been done on incremental
+learning for data streams, evaluations are mostly conducted with manually
+partitioned datasets. Moreover, while several real-world streaming datasets
+are available, it is uncertain whether these open environment challenges are
+prevalent and how existing incremental learning algorithms perform on real
+datasets. To fill this gap, we develop an Open Environment Benchmark named
+OEBench to evaluate open environment challenges in relational data streams.
+Specifically, we investigate 55 real-world streaming datasets and establish
+that open environment scenarios are indeed widespread in real-world datasets,
+which presents significant challenges for stream learning algorithms. Through
+benchmarks, we find that increased data quantity may not consistently enhance
+model accuracy when applied in open environment scenarios, where machine
+learning models can be significantly compromised by distribution shifts,
+anomalies, or untrustworthy data within real-world data streams. The current
+techniques are insufficient in effectively mitigating these challenges posed
+by open environments. Thus, further research is needed to address the new
+real-world challenges of open environment scenarios.
+
+
+
+
+
+ + ☆ Taxonomic Loss for Morphological Glossing of Low-Resource Languages + + +
+ Morpheme glossing is a critical task in automated language documentation
+and can benefit other downstream applications greatly. While
+state-of-the-art glossing systems perform very well for languages with large
+amounts of existing data, it is more difficult to create useful models for
+low-resource languages. In this paper, we propose the use of a taxonomic loss
+function that exploits morphological information to make morphological
+glossing more performant when data is scarce. We find that while the use of
+this loss function does not outperform a standard loss function with regard
+to single-label prediction accuracy, it produces better predictions when
+considering the top-n predicted labels. We suggest this property makes the
+taxonomic loss function useful in a human-in-the-loop annotation setting.
+
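+
+A minimal sketch of one plausible form of a taxonomic loss (the paper's exact
+formulation may differ; the 6-label/3-group taxonomy below is invented):
+cross-entropy on the fine labels plus cross-entropy on their coarse
+ancestors, so near-misses within the right group are penalized less.
+
+```python
+import torch
+import torch.nn.functional as F
+
+# Hypothetical taxonomy: 6 fine-grained gloss labels grouped into 3 coarse tags
+fine_to_coarse = torch.tensor([0, 0, 1, 1, 2, 2])
+
+def taxonomic_loss(logits, target, alpha=0.5):
+    """CE on fine labels plus CE on their coarse ancestors."""
+    fine_ce = F.cross_entropy(logits, target)
+    probs = logits.softmax(dim=-1)
+    coarse_probs = torch.zeros(logits.size(0), 3).index_add_(
+        1, fine_to_coarse, probs)             # sum fine probs per coarse group
+    coarse_ce = F.nll_loss(coarse_probs.clamp_min(1e-9).log(),
+                           fine_to_coarse[target])
+    return fine_ce + alpha * coarse_ce
+
+logits = torch.randn(4, 6, requires_grad=True)
+target = torch.tensor([0, 2, 4, 5])
+print(taxonomic_loss(logits, target))
+```
+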
+
+
+
+
+ + ☆ iBARLE: imBalance-Aware Room Layout Estimation + + +
+ Room layout estimation predicts layouts from a single panorama. It requires +datasets with large-scale and diverse room shapes to train the models. However, +there are significant imbalances in real-world datasets including the +dimensions of layout complexity, camera locations, and variation in scene +appearance. These issues considerably influence the model training performance. +In this work, we propose the imBalance-Aware Room Layout Estimation (iBARLE) +framework to address these issues. iBARLE consists of (1) Appearance Variation +Generation (AVG) module, which promotes visual appearance domain +generalization, (2) Complex Structure Mix-up (CSMix) module, which enhances +generalizability w.r.t. room structure, and (3) a gradient-based layout +objective function, which allows more effective accounting for occlusions in +complex layouts. All modules are jointly trained and help each other to achieve +the best performance. Experiments and ablation studies based on +ZInD~\cite{cruz2021zillow} dataset illustrate that iBARLE has state-of-the-art +performance compared with other layout estimation baselines. + +
+
+
+
+
+ + ☆ Large language models converge toward human-like concept organization + + +
+ Large language models show human-like performance in knowledge extraction, +reasoning and dialogue, but it remains controversial whether this performance +is best explained by memorization and pattern matching, or whether it reflects +human-like inferential semantics and world knowledge. Knowledge bases such as +WikiData provide large-scale, high-quality representations of inferential +semantics and world knowledge. We show that large language models learn to +organize concepts in ways that are strikingly similar to how concepts are +organized in such knowledge bases. Knowledge bases model collective, +institutional knowledge, and large language models seem to induce such +knowledge from raw text. We show that bigger and better models exhibit more +human-like concept organization, across four families of language models and +three knowledge graph embeddings. + +
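+
+One common way to quantify such similarity of concept organization is
+representational similarity analysis; the paper's exact protocol may differ.
+A minimal sketch with synthetic stand-in embeddings:
+
+```python
+import numpy as np
+
+def rsa(emb_a, emb_b):
+    """Correlate pairwise-cosine matrices of two embedding spaces over the
+    same concept vocabulary (higher = more similar organization)."""
+    def cos_matrix(e):
+        e = e / np.linalg.norm(e, axis=1, keepdims=True)
+        return e @ e.T
+    a, b = cos_matrix(emb_a), cos_matrix(emb_b)
+    iu = np.triu_indices(len(a), k=1)          # upper-triangle entries only
+    return np.corrcoef(a[iu], b[iu])[0, 1]
+
+rng = np.random.default_rng(0)
+lm = rng.normal(size=(50, 768))                # stand-in LM concept embeddings
+kg = lm[:, :64] + 0.5 * rng.normal(size=(50, 64))  # correlated KG embeddings
+print(rsa(lm, kg))
+```
+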
+
+
+
+
+ + ☆ Massively Parallel Continuous Local Search for Hybrid SAT Solving on + GPUs + + +
+ Although state-of-the-art (SOTA) SAT solvers based on conflict-driven clause +learning (CDCL) have achieved remarkable engineering success, their sequential +nature limits the parallelism that may be extracted for acceleration on +platforms such as the graphics processing unit (GPU). In this work, we propose +FastFourierSAT, a highly parallel hybrid SAT solver based on gradient-driven +continuous local search (CLS). This is realized by a novel parallel algorithm +inspired by the Fast Fourier Transform (FFT)-based convolution for computing +the elementary symmetric polynomials (ESPs), which is the major computational +task in previous CLS methods. The complexity of our algorithm matches the best +previous result. Furthermore, the substantial parallelism inherent in our +algorithm can leverage the GPU for acceleration, demonstrating significant +improvement over the previous CLS approaches. We also propose to incorporate +the restart heuristics in CLS to improve search efficiency. We compare our +approach with the SOTA parallel SAT solvers on several benchmarks. Our results +show that FastFourierSAT computes the gradient 100+ times faster than previous +prototypes implemented on CPU. Moreover, FastFourierSAT solves most instances +and demonstrates promising performance on larger-size instances. + +
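+
+The core computational kernel, computing all elementary symmetric polynomials
+(ESPs) as the coefficients of prod_i(1 + x_i t), can be sketched with a
+convolution tree (np.convolve below; the paper's FFT-based convolution swaps
+in for large degrees):
+
+```python
+import numpy as np
+
+def esp(xs):
+    """All ESPs of xs: multiply the degree-1 polynomials (1 + x_i t) pairwise;
+    the coefficients of the product are e_0, e_1, ..., e_n."""
+    polys = [np.array([1.0, x]) for x in xs]   # ascending coefficients
+    while len(polys) > 1:
+        paired = []
+        for i in range(0, len(polys) - 1, 2):
+            paired.append(np.convolve(polys[i], polys[i + 1]))
+        if len(polys) % 2:                     # odd one out carries over
+            paired.append(polys[-1])
+        polys = paired
+    return polys[0]
+
+print(esp([1.0, 2.0, 3.0]))   # [1, 6, 11, 6]: e_0..e_3 of {1, 2, 3}
+```
+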
+
+
+
+
+ + ☆ Exploiting Problem Geometry in Safe Linear Bandits + + +
+ The safe linear bandit problem is a version of the classic linear bandit
+problem where the learner's actions must satisfy an uncertain linear
+constraint at all rounds. Due to its applicability to many real-world
+settings, this problem has received considerable attention in recent years.
+We find that by exploiting the geometry of the specific problem setting, we
+can achieve improved regret guarantees for both well-separated problem
+instances and action sets that are finite star convex sets. Additionally, we
+propose a novel algorithm for this setting that chooses problem parameters
+adaptively and enjoys at least as good regret guarantees as existing
+algorithms. Lastly, we introduce a generalization of the safe linear bandit
+setting where the constraints are convex and adapt our algorithms and
+analyses to this setting by leveraging a novel convex-analysis based
+approach. Simulation results show improved performance over existing
+algorithms for a variety of randomly sampled settings.
+
+
+ comment: 38 pages, 4 figures +
+
+
+
+
+ + ☆ WSAM: Visual Explanations from Style Augmentation as Adversarial + Attacker and Their Influence in Image Classification + + +
+ Currently, style augmentation is capturing attention due to convolutional
+neural networks (CNN) being strongly biased toward recognizing textures
+rather than shapes. Most existing styling methods either perform a
+low-fidelity style transfer or a weak style representation in the embedding
+vector. This paper outlines a style augmentation algorithm using
+stochastic-based sampling with noise addition to improve randomization on a
+general linear transformation for style transfer. With our augmentation
+strategy, all models not only exhibit remarkable robustness against image
+stylization but also outperform all previous methods and surpass the
+state-of-the-art performance for the STL-10 dataset. In addition, we present
+an analysis of the model interpretations under different style variations.
+Finally, we conduct comprehensive experiments demonstrating the performance
+when the method is applied to deep neural architectures in various training
+settings.
+
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Incorporating Neuro-Inspired Adaptability for Continual Learning in + Artificial Intelligence + + +
+ Continual learning aims to empower artificial intelligence (AI) with strong +adaptability to the real world. For this purpose, a desirable solution should +properly balance memory stability with learning plasticity, and acquire +sufficient compatibility to capture the observed distributions. Existing +advances mainly focus on preserving memory stability to overcome catastrophic +forgetting, but remain difficult to flexibly accommodate incremental changes as +biological intelligence (BI) does. By modeling a robust Drosophila learning +system that actively regulates forgetting with multiple learning modules, here +we propose a generic approach that appropriately attenuates old memories in +parameter distributions to improve learning plasticity, and accordingly +coordinates a multi-learner architecture to ensure solution compatibility. +Through extensive theoretical and empirical validation, our approach not only +clearly enhances the performance of continual learning, especially over +synaptic regularization methods in task-incremental settings, but also +potentially advances the understanding of neurological adaptive mechanisms, +serving as a novel paradigm to progress AI and BI together. + +
+
+
+
+
+ + ☆ Constructive Incremental Learning for Fault Diagnosis of Rolling + Bearings with Ensemble Domain Adaptation + + +
+ Given the prevalence of rolling bearing fault diagnosis as a practical
+issue across various working conditions, the limited availability of samples
+compounds the challenge. Additionally, the complexity of the external
+environment and the structure of rolling bearings often manifest as faults
+characterized by randomness and fuzziness, hindering the effective extraction
+of fault characteristics and restricting the accuracy of fault diagnosis. To
+overcome these problems, this paper presents a novel approach termed
+Constructive Incremental Learning-based Ensemble Domain Adaptation (CIL-EDA).
+Specifically, it is implemented on stochastic configuration networks (SCN) to
+constructively improve their adaptive performance in multi-domains.
+Concretely, a cloud feature extraction method is employed in conjunction with
+wavelet packet decomposition (WPD) to capture the uncertainty of fault
+information from multiple resolution aspects. Subsequently, Constructive
+Incremental Learning-based Domain Adaptation (CIL-DA) is first developed to
+enhance the cross-domain learning capability of each hidden node through
+domain matching and to construct a robust fault classifier by leveraging
+limited labeled data from both target and source domains. Finally, fault
+diagnosis results are obtained by majority voting in CIL-EDA, which
+integrates CIL-DA and parallel ensemble learning. Experimental results
+demonstrate that CIL-DA outperforms several domain adaptation methods and
+CIL-EDA consistently outperforms state-of-the-art fault diagnosis methods in
+few-shot scenarios.
+
+
+
+
+
+ + ☆ Sub-universal variational circuits for combinatorial optimization + problems + + +
+ Quantum variational circuits have gained significant attention due to their
+applications in the quantum approximate optimization algorithm and quantum
+machine learning research. This work introduces a novel class of classical
+probabilistic circuits designed for generating approximate solutions to
+combinatorial optimization problems, constructed using two-bit stochastic
+matrices. Through a numerical study, we investigate the performance of our
+proposed variational circuits in solving the Max-Cut problem on various
+graphs of increasing sizes. Our classical algorithm demonstrates improved
+performance for several graph types compared to the quantum approximate
+optimization algorithm. Our findings suggest that evaluating the performance
+of quantum variational circuits against variational circuits with
+sub-universal gate sets is a valuable benchmark for identifying areas where
+quantum variational circuits can excel.
+
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Efficient labeling of solar flux evolution videos by a deep learning + model + + +
+ Machine learning (ML) is becoming a critical tool for interrogation of
+large complex data. Labeling, defined as the process of adding meaningful
+annotations, is a crucial step of supervised ML. However, labeling datasets
+is time-consuming. Here we show that convolutional neural networks (CNNs),
+trained on crudely labeled astronomical videos, can be leveraged to improve
+the quality of data labeling and reduce the need for human intervention. We
+use videos of the solar magnetic field, crudely labeled into two classes:
+emergence or non-emergence of bipolar magnetic regions (BMRs), based on their
+first detection on the solar disk. We train CNNs using crude labels, manually
+verify and correct the labels where they disagree with the CNN, and repeat
+this process until convergence. Traditionally, flux emergence labeling is
+done manually. We find that a high-quality labeled dataset, derived through
+this iterative process, reduces the necessary manual verification by 50%.
+Furthermore, by gradually masking the videos and looking for maximum change
+in CNN inference, we locate BMR emergence time without retraining the CNN.
+This demonstrates the versatility of CNNs for simplifying the challenging
+task of labeling complex dynamic events.
+
+
+ comment: 16 pages, 7 figures, published in Nature Astronomy, June 27, 2022 +
+
+
+
+
+ + ☆ Distributed multi-agent target search and tracking with Gaussian process + and reinforcement learning + + +
+ Deploying multiple robots for target search and tracking has many practical +applications, yet the challenge of planning over unknown or partially known +targets remains difficult to address. With recent advances in deep learning, +intelligent control techniques such as reinforcement learning have enabled +agents to learn autonomously from environment interactions with little to no +prior knowledge. Such methods can address the exploration-exploitation tradeoff +of planning over unknown targets in a data-driven manner, eliminating the +reliance on heuristics typical of traditional approaches and streamlining the +decision-making pipeline with end-to-end training. In this paper, we propose a +multi-agent reinforcement learning technique with target map building based on +distributed Gaussian process. We leverage the distributed Gaussian process to +encode belief over the target locations and efficiently plan over unknown +targets. We evaluate the performance and transferability of the trained policy +in simulation and demonstrate the method on a swarm of micro unmanned aerial +vehicles with hardware experiments. + +
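+
+A minimal single-agent sketch of the GP-based target belief (the distributed
+and RL components are omitted; the signal field and the UCB exploration rule
+below are invented stand-ins):
+
+```python
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import RBF
+
+rng = np.random.default_rng(0)
+obs_xy = rng.uniform(0, 10, (30, 2))              # visited locations
+obs_val = np.exp(-np.sum((obs_xy - 7) ** 2, 1))   # stand-in target signal
+
+# GP encodes the belief over target locations from past observations
+gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0)).fit(obs_xy, obs_val)
+grid = np.stack(np.meshgrid(np.linspace(0, 10, 20),
+                            np.linspace(0, 10, 20)), -1).reshape(-1, 2)
+mean, std = gp.predict(grid, return_std=True)
+next_wp = grid[np.argmax(mean + 2.0 * std)]       # UCB-style exploration target
+print(next_wp)
+```
+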
+
+ comment: 10 pages, 6 figures; preprint submitted to IJCAS; first two authors + contributed equally +
+
+
+
+
+ + ☆ Reprogramming under constraints: Revisiting efficient and reliable + transferability of lottery tickets + + +
+ In the era of foundation models with huge pre-training budgets, the
+downstream tasks have been shifted to the narrative of efficient and fast
+adaptation. For classification-based tasks in the domain of computer vision,
+the two most efficient approaches have been linear probing (LP) and visual
+prompting/reprogramming (VP); the former aims to learn a classifier in the
+form of a linear head on the features extracted by the pre-trained model,
+while the latter maps the input data to the domain of the source data on
+which the model was originally pre-trained. Although extensive studies have
+demonstrated the differences between LP and VP in terms of downstream
+performance, we explore the capabilities of the two aforementioned methods
+via the sparsity axis: (a) Data sparsity: the impact of few-shot adaptation
+and (b) Model sparsity: the impact of lottery tickets (LTs). We demonstrate
+that LTs are not universal reprogrammers, i.e., for certain target datasets,
+reprogramming an LT yields significantly lower performance than the
+reprogrammed dense model although their corresponding upstream performance is
+similar. Further, we demonstrate that the calibration of dense models is
+always superior to that of their lottery ticket counterparts under both LP
+and VP regimes. Our empirical study opens a new avenue of research into VP
+for sparse models and encourages further understanding of the performance
+beyond the accuracy achieved by VP under constraints of sparsity. Code and
+logs can be accessed at \url{https://github.com/landskape-ai/Reprogram_LT}.
+
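+
+A minimal sketch of visual prompting as a learnable border added to the
+inputs of a frozen model (one common VP formulation, not necessarily the
+paper's):
+
+```python
+import torch
+import torch.nn as nn
+
+class VisualPrompt(nn.Module):
+    """Learnable border padding applied to inputs of a frozen backbone."""
+    def __init__(self, pad=16, size=224):
+        super().__init__()
+        self.delta = nn.Parameter(torch.zeros(3, size, size))
+        mask = torch.ones(3, size, size)
+        mask[:, pad:-pad, pad:-pad] = 0        # only the border is trainable
+        self.register_buffer("mask", mask)
+
+    def forward(self, x):
+        return x + self.delta * self.mask      # backbone weights stay untouched
+
+x = torch.randn(2, 3, 224, 224)
+print(VisualPrompt()(x).shape)                 # (2, 3, 224, 224)
+```
+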
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Streaming Compression of Scientific Data via weak-SINDy + + +
+ In this paper a streaming weak-SINDy algorithm is developed specifically
+for compressing streaming scientific data. The production of scientific data,
+either via simulation or experiments, is undergoing a stage of exponential
+growth, which makes data compression important and often necessary for
+storing and utilizing large scientific data sets. As opposed to classical
+"offline" compression algorithms that perform compression on a readily
+available data set, streaming compression algorithms compress data "online"
+while the data generated from simulation or experiments is still flowing
+through the system. This feature makes streaming compression algorithms
+well-suited for scientific data compression, where storing the full data set
+offline is often infeasible. This work proposes a new streaming compression
+algorithm, streaming weak-SINDy, which takes advantage of the underlying data
+characteristics during compression. The streaming weak-SINDy algorithm
+constructs feature matrices and target vectors in the online stage via a
+streaming integration method in a memory-efficient manner. The feature
+matrices and target vectors are then used in the offline stage to build a
+model through a regression process that aims to recover equations that govern
+the evolution of the data. For compressing high-dimensional streaming data,
+we adopt a streaming proper orthogonal decomposition (POD) process to reduce
+the data dimension and then use the streaming weak-SINDy algorithm to
+compress the temporal data of the POD expansion. We propose modifications to
+the streaming weak-SINDy algorithm to accommodate the dynamically updated POD
+basis. By combining the built model from the streaming weak-SINDy algorithm
+and a small amount of data samples, the full data flow can be reconstructed
+accurately at a low memory cost, as shown in the numerical tests.
+
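+
+A minimal sketch of the online/offline split described above, using streaming
+normal-equation accumulators in place of the paper's weak-form integration
+(the toy linear model below is invented):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+d = 5                                  # number of candidate library features
+true_w = np.array([1.0, 0.0, -2.0, 0.0, 0.5])
+
+G = np.zeros((d, d))                   # Gram matrix, accumulated online
+b = np.zeros(d)                        # feature-target products, accumulated
+for _ in range(10_000):                # data flows through and is discarded
+    phi = rng.normal(size=d)           # one window's feature row
+    y = phi @ true_w + 0.01 * rng.normal()
+    G += np.outer(phi, phi)            # memory stays O(d^2), not O(T)
+    b += phi * y
+
+w = np.linalg.solve(G, b)              # offline stage: recover governing model
+print(np.round(w, 3))                  # approx [1, 0, -2, 0, 0.5]
+```
+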
+
+
+
+
+ + ☆ Robust Open-Set Spoken Language Identification and the CU MultiLang + Dataset + + +
+ Most state-of-the-art spoken language identification models are closed-set; +in other words, they can only output a language label from the set of classes +they were trained on. Open-set spoken language identification systems, however, +gain the ability to detect when an input exhibits none of the original +languages. In this paper, we implement a novel approach to open-set spoken +language identification that uses MFCC and pitch features, a TDNN model to +extract meaningful feature embeddings, confidence thresholding on softmax +outputs, and LDA and pLDA for learning to classify new unknown languages. We +present a spoken language identification system that achieves 91.76% accuracy +on trained languages and has the capability to adapt to unknown languages on +the fly. To that end, we also built the CU MultiLang Dataset, a large and +diverse multilingual speech corpus which was used to train and evaluate our +system. + +
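+
+A minimal sketch of the confidence-thresholding step described above (the
+languages, logits, and threshold are invented stand-ins):
+
+```python
+import numpy as np
+
+def classify_open_set(logits, languages, threshold=0.7):
+    """Return a trained-language label, or 'unknown' under low confidence."""
+    probs = np.exp(logits - logits.max())      # numerically stable softmax
+    probs /= probs.sum()
+    best = int(np.argmax(probs))
+    return languages[best] if probs[best] >= threshold else "unknown"
+
+langs = ["en", "es", "zh"]
+print(classify_open_set(np.array([4.0, 0.5, 0.2]), langs))  # confident -> 'en'
+print(classify_open_set(np.array([1.0, 0.9, 0.8]), langs))  # flat -> 'unknown'
+```
+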
+
+ comment: 6 pages, 1 table, 6 figures
+
+
+
+
+ + ☆ Low-bit Quantization for Deep Graph Neural Networks with + Smoothness-aware Message Propagation CIKM2023 + + +
+ Graph Neural Network (GNN) training and inference involve significant +challenges of scalability with respect to both model sizes and number of +layers, resulting in degradation of efficiency and accuracy for large and deep +GNNs. We present an end-to-end solution that aims to address these challenges +for efficient GNNs in resource constrained environments while avoiding the +oversmoothing problem in deep GNNs. We introduce a quantization based approach +for all stages of GNNs, from message passing in training to node +classification, compressing the model and enabling efficient processing. The +proposed GNN quantizer learns quantization ranges and reduces the model size +with comparable accuracy even under low-bit quantization. To scale with the +number of layers, we devise a message propagation mechanism in training that +controls layer-wise changes of similarities between neighboring nodes. This +objective is incorporated into a Lagrangian function with constraints and a +differential multiplier method is utilized to iteratively find optimal +embeddings. This mitigates oversmoothing and suppresses the quantization error +to a bound. Significant improvements are demonstrated over state-of-the-art +quantization methods and deep GNN approaches in both full-precision and +quantized models. The proposed quantizer demonstrates superior performance in +INT2 configurations across all stages of GNN, achieving a notable level of +accuracy. In contrast, existing quantization approaches fail to generate +satisfactory accuracy levels. Finally, the inference with INT2 and INT4 +representations exhibits a speedup of 5.11 $\times$ and 4.70 $\times$ compared +to full precision counterparts, respectively. + +
+
+ comment: To appear in CIKM2023 +
+
+
+
+
+ + ☆ Improving Reinforcement Learning Training Regimes for Social Robot + Navigation + + +
+ In order for autonomous mobile robots to navigate in human spaces, they
+must abide by our social norms. Reinforcement learning (RL) has emerged as an
+effective method to train robot navigation policies that are able to respect
+these norms. However, a large portion of existing work in the field conducts
+both RL training and testing in simplistic environments. This limits the
+generalization potential of these models to unseen environments, and the
+meaningfulness of their reported results. We propose a method to improve the
+generalization performance of RL social navigation methods using curriculum
+learning. By employing multiple environment types and by modeling pedestrians
+using multiple dynamics models, we are able to progressively diversify and
+escalate difficulty in training. Our results show that curriculum learning in
+training can be used to achieve better generalization performance than
+previous training methods. We also show that results presented in many
+existing state-of-the-art RL social navigation works do not evaluate their
+methods outside of their training environments, and thus do not reflect their
+policies' failure to adequately generalize to out-of-distribution scenarios.
+In response, we validate our training approach on larger and more crowded
+testing environments than those used in training, allowing for more
+meaningful measurements of model performance.
+
+
+
+
+
+ + ☆ Bridging Distribution Learning and Image Clustering in High-dimensional + Space + + +
+ Distribution learning focuses on learning the probability density function
+from a set of data samples. In contrast, clustering aims to group similar
+objects together in an unsupervised manner. Usually, these two tasks are
+considered unrelated. However, the two may be indirectly related, with
+Gaussian Mixture Models (GMMs) acting as a bridge. In this paper, we focus on
+exploring the correlation between distribution learning and clustering, with
+the motivation to fill the gap between these two fields, utilizing an
+autoencoder (AE) to encode images into a high-dimensional latent space. Then,
+Monte-Carlo Marginalization (MCMarg) and a Kullback-Leibler (KL) divergence
+loss are used to fit the Gaussian components of the GMM and learn the data
+distribution. Finally, image clustering is achieved through each Gaussian
+component of the GMM. Yet, the "curse of dimensionality" poses severe
+challenges for most clustering algorithms. Compared with the classic
+Expectation-Maximization (EM) Algorithm, experimental results show that
+MCMarg and KL divergence can greatly alleviate the difficulty. Based on the
+experimental results, we believe distribution learning can exploit the
+potential of GMM in image clustering within high-dimensional space.
+
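+
+A minimal sketch of clustering AE latents with a GMM; note that scikit-learn
+fits the GMM by EM, whereas the paper's MCMarg/KL fitting is precisely what
+it argues works better in high dimensions. The latents below are synthetic
+stand-ins for encoder outputs.
+
+```python
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(0)
+# Stand-in for autoencoder latents: two Gaussian blobs in a 32-D latent space
+z = np.vstack([rng.normal(0, 1, (200, 32)), rng.normal(4, 1, (200, 32))])
+
+gmm = GaussianMixture(n_components=2, covariance_type="diag", random_state=0)
+clusters = gmm.fit_predict(z)      # each Gaussian component = one image cluster
+print(np.bincount(clusters))
+```
+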
+
+
+
+
+ + ☆ Deep Reinforcement Learning Based Framework for Mobile Energy + Disseminator Dispatching to Charge On-the-Road Electric Vehicles + + +
+ The exponential growth of electric vehicles (EVs) presents novel challenges
+in preserving battery health and in addressing the persistent problem of
+vehicle range anxiety. To address these concerns, wireless charging,
+particularly Mobile Energy Disseminators (MEDs), has emerged as a promising
+solution. The MED is mounted behind a large vehicle and charges all
+participating EVs within a radius upstream of it. Unfortunately, during such
+V2V charging, the MED and EVs inadvertently form platoons, thereby occupying
+multiple lanes and impairing overall corridor travel efficiency. In addition,
+constrained budgets for MED deployment necessitate the development of an
+effective dispatching strategy to determine optimal timing and locations for
+introducing the MEDs into traffic. This paper proposes a deep reinforcement
+learning (DRL) based methodology to develop a vehicle dispatching framework.
+In the first component of the framework, we develop a realistic reinforcement
+learning environment termed "ChargingEnv" which incorporates a reliable
+charging simulation system that accounts for common practical issues in
+wireless charging deployment, specifically, the charging panel misalignment.
+The second component, the Proximal-Policy Optimization (PPO) agent, is
+trained to control MED dispatching through continuous interactions with
+ChargingEnv. Numerical experiments were carried out to demonstrate the
+efficacy of the proposed MED deployment decision processor. The experiment
+results suggest that the proposed model can significantly enhance EV travel
+range while efficiently deploying an optimal number of MEDs. The proposed
+model is found to be not only practical in its applicability but also
+promising in its real-world effectiveness. The proposed model can help
+travelers maximize EV range and help road agencies or private-sector vendors
+manage the deployment of MEDs efficiently.
+
+
+ comment: Submitted for presentation only at the 2024 Annual Meeting of the + Transportation Research Board +
+
+
+
+
+ + ☆ Ensuring User-side Fairness in Dynamic Recommender Systems + + +
+ User-side group fairness is crucial for modern recommender systems, as it
+aims to alleviate performance disparity between groups of users defined by
+sensitive attributes such as gender, race, or age. We find that the disparity
+tends to persist or even increase over time. This calls for effective ways to
+address user-side fairness in a dynamic environment, which has been
+infrequently explored in the literature. However, fairness-constrained
+re-ranking, a typical method to ensure user-side fairness (i.e., reducing
+performance disparity), faces two fundamental challenges in the dynamic
+setting: (1) non-differentiability of the ranking-based fairness constraint,
+which hinders the end-to-end training paradigm, and (2) time-inefficiency,
+which impedes quick adaptation to changes in user preferences. In this paper,
+we propose FAir Dynamic rEcommender (FADE), an end-to-end framework with a
+fine-tuning strategy to dynamically alleviate performance disparity. To
+tackle the above challenges, FADE uses a novel fairness loss designed to be
+differentiable and lightweight to fine-tune model parameters to ensure both
+user-side fairness and high-quality recommendations. Via extensive
+experiments on a real-world dataset, we empirically demonstrate that FADE
+effectively and efficiently reduces performance disparity, and furthermore,
+FADE improves overall recommendation quality over time compared to not using
+any new data.
+
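+
+A generic sketch of a differentiable group-disparity penalty to convey the
+idea (FADE's actual fairness loss is more involved; the form below is
+invented):
+
+```python
+import torch
+
+def disparity_penalized_loss(per_user_loss, group, lam=0.1):
+    """Mean recommendation loss plus a differentiable disparity penalty.
+    `group` is a 0/1 tensor marking each user's sensitive attribute; both
+    groups are assumed non-empty in the batch."""
+    adv = per_user_loss[group == 0].mean()    # advantaged-group loss
+    dis = per_user_loss[group == 1].mean()    # disadvantaged-group loss
+    return per_user_loss.mean() + lam * (dis - adv).clamp_min(0)
+
+loss = torch.rand(64, requires_grad=True)     # stand-in per-user losses
+group = torch.randint(0, 2, (64,))
+print(disparity_penalized_loss(loss, group))  # fully differentiable
+```
+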
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ A General Recipe for Automated Machine Learning in Practice + + +
+ Automated Machine Learning (AutoML) is an area of research that focuses on +developing methods to generate machine learning models automatically. The idea +of being able to build machine learning models with very little human +intervention represents a great opportunity for the practice of applied machine +learning. However, there is very little information on how to design an AutoML +system in practice. Most of the research focuses on the problems facing +optimization algorithms and leaves out the details of how that would be done in +practice. In this paper, we propose a frame of reference for building general +AutoML systems. Through a narrative review of the main approaches in the area, +our main idea is to distill the fundamental concepts in order to support them +in a single design. Finally, we discuss some open problems related to the +application of AutoML for future research. + +
+
+
+
+
+ + ☆ Clustering Without an Eigengap + + +
+ We study graph clustering in the Stochastic Block Model (SBM) in the presence +of both large clusters and small, unrecoverable clusters. Previous approaches +achieving exact recovery do not allow any small clusters of size $o(\sqrt{n})$, +or require a size gap between the smallest recovered cluster and the largest +non-recovered cluster. We provide an algorithm based on semidefinite +programming (SDP) which removes these requirements and provably recovers large +clusters regardless of the remaining cluster sizes. Mid-sized clusters pose +unique challenges to the analysis, since their proximity to the recovery +threshold makes them highly sensitive to small noise perturbations and +precludes a closed-form candidate solution. We develop novel techniques, +including a leave-one-out-style argument which controls the correlation between +SDP solutions and noise vectors even when the removal of one row of noise can +drastically change the SDP solution. We also develop improved eigenvalue +perturbation bounds of potential independent interest. Using our gap-free +clustering procedure, we obtain efficient algorithms for the problem of +clustering with a faulty oracle with superior query complexities, notably +achieving $o(n^2)$ sample complexity even in the presence of a large number of +small clusters. Our gap-free clustering procedure also leads to improved +algorithms for recursive clustering. Our results extend to certain +heterogeneous probability settings that are challenging for alternative +algorithms. + +
+
+ comment: 68 pages, 1 figure +
+
+
+
+
+ + ☆ Identifying Constitutive Parameters for Complex Hyperelastic Solids + using Physics-Informed Neural Networks + + +
+ Identifying constitutive parameters in engineering and biological
+materials, particularly those with intricate geometries and mechanical
+behaviors, remains a longstanding challenge. The recent advent of
+Physics-Informed Neural Networks (PINNs) offers promising solutions, but
+current frameworks are often limited to basic constitutive laws and encounter
+practical constraints when combined with experimental data. In this paper, we
+introduce a new PINN-based framework designed to identify material parameters
+for soft materials, specifically those exhibiting complex constitutive
+behaviors, under large deformation in plane stress conditions. Distinctively,
+our model emphasizes training PINNs with multi-modal time-dependent
+experimental datasets consisting of full-field deformation and loading
+history, ensuring algorithm robustness even amidst noisy data. Our results
+reveal that our framework can accurately identify constitutive parameters of
+the incompressible Arruda-Boyce model for samples with intricate geometries,
+maintaining an error below 5%, even with an experimental noise level of 5%.
+We believe our framework sets the stage for a transformative approach to
+modulus identification for complex solids, especially those with geometric
+and constitutive intricacies.
+
+
+ comment: 31 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Hyperbolic Convolutional Neural Networks + + +
+ Deep Learning is mostly responsible for the surge of interest in Artificial
+Intelligence in the last decade. So far, deep learning researchers have been
+particularly successful in the domain of image processing, where
+Convolutional Neural Networks are used. Although excelling at image
+classification, Convolutional Neural Networks are quite naive in that no
+inductive bias is set on the embedding space for images. Similar flaws are
+also exhibited by another type of Convolutional Networks - Graph
+Convolutional Neural Networks. However, using non-Euclidean space for
+embedding data might result in more robust and explainable models. One
+example of such a non-Euclidean space is hyperbolic space. Hyperbolic spaces
+are particularly useful due to their ability to fit more data in a
+low-dimensional space and their tree-likeness properties. These attractive
+properties have been previously used in multiple papers which indicated that
+they are beneficial for building hierarchical embeddings using shallow models
+and, recently, using MLPs and RNNs.
+ However, no papers have yet suggested a general approach to using Hyperbolic
+Convolutional Neural Networks for structured data processing, although these
+are the most common examples of data used. Therefore, the goal of this work
+is to devise a general recipe for building Hyperbolic Convolutional Neural
+Networks. We hypothesize that the ability of hyperbolic space to capture
+hierarchy in the data would lead to better performance. This ability should
+be particularly useful in cases where data has a tree-like structure. Since
+this is the case for many existing datasets \citep{wordnet, imagenet, fb15k},
+we argue that such a model would be advantageous both in terms of
+applications and future research prospects.
+
+
+
+
+
+ + ☆ RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware + Contextual Reasoning on Whole Slide Images AAAI + + +
+ Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer +in the US. It is diagnosed by manual multi-class tumor grading using a tissue +whole slide image (WSI), which is subjective and suffers from inter-pathologist +variability. We propose an automated weakly-supervised grading approach for +cSCC WSIs that is trained using WSI-level grade and does not require +fine-grained tumor annotations. The proposed model, RACR-MIL, transforms each +WSI into a bag of tiled patches and leverages attention-based multiple-instance +learning to assign a WSI-level grade. We propose three key innovations to +address general as well as cSCC-specific challenges in tumor grading. First, we +leverage spatial and semantic proximity to define a WSI graph that encodes both +local and non-local dependencies between tumor regions and leverage graph +attention convolution to derive contextual patch features. Second, we introduce +a novel ordinal ranking constraint on the patch attention network to ensure +that higher-grade tumor regions are assigned higher attention. Third, we use +tumor depth as an auxiliary task to improve grade classification in a multitask +learning framework. RACR-MIL achieves 2-9% improvement in grade classification +over existing weakly-supervised approaches on a dataset of 718 cSCC tissue +images and localizes the tumor better. The model achieves 5-20% higher accuracy +in difficult-to-classify high-risk grade classes and is robust to class +imbalance. + +
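For readers unfamiliar with attention-based multiple-instance learning, a bare-bones pooling head (in the spirit of Ilse et al.) looks like the PyTorch sketch below; RACR-MIL layers graph attention over patches and the ordinal ranking constraint on top of this basic mechanism.

```python
import torch
import torch.nn as nn

class AttentionMIL(nn.Module):
    def __init__(self, dim: int = 512, n_grades: int = 3):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(dim, 128), nn.Tanh(),
                                  nn.Linear(128, 1))
        self.head = nn.Linear(dim, n_grades)

    def forward(self, patches: torch.Tensor):
        # patches: (num_patches, dim) features of one whole slide image
        a = torch.softmax(self.attn(patches), dim=0)  # attention per patch
        slide = (a * patches).sum(dim=0)              # weighted WSI embedding
        return self.head(slide), a.squeeze(-1)        # grade logits, weights
```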
+
+ comment: 7 pages main text, 2 page references, 3 page appendix; submitted to + AAAI +
+
+
+
+
+ + ☆ Everything Perturbed All at Once: Enabling Differentiable Graph Attacks + + +
+ As powerful tools for representation learning on graphs, graph neural +networks (GNNs) have played an important role in applications including social +networks, recommendation systems, and online web services. However, GNNs have +been shown to be vulnerable to adversarial attacks, which can significantly +degrade their effectiveness. Recent state-of-the-art approaches in adversarial +attacks rely on gradient-based meta-learning to selectively perturb a single +edge with the highest attack score until they reach the budget constraint. +While effective in identifying vulnerable links, these methods are plagued by +high computational costs. By leveraging continuous relaxation and +parameterization of the graph structure, we propose a novel attack method +called Differentiable Graph Attack (DGA) to efficiently generate effective +attacks while eliminating the need for costly retraining. Compared to the +state-of-the-art, DGA achieves nearly equivalent attack performance with 6 +times less training time and 11 times smaller GPU memory footprint on different +benchmark datasets. Additionally, we provide extensive experimental analyses of +the transferability of the DGA among different graph models, as well as its +robustness against widely-used defense mechanisms.
+
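The continuous-relaxation idea can be illustrated with a short PyTorch sketch: edge flips are parameterized by a matrix of logits and updated by gradient ascent on the victim's loss. The budget projection and final discretization are omitted, and all names here are assumptions rather than the authors' code.

```python
import torch

def dga_step(adj: torch.Tensor, theta: torch.Tensor, victim_loss, lr: float = 0.1):
    """One relaxed attack step; theta is a learnable (n, n) logit matrix."""
    p_flip = torch.sigmoid(theta)            # probability of flipping each edge
    adj_pert = adj + (1 - 2 * adj) * p_flip  # relaxed edge flips in [0, 1]
    loss = victim_loss(adj_pert)             # loss of the attacked GNN
    loss.backward()
    with torch.no_grad():
        theta += lr * theta.grad             # ascend to degrade the model
        theta.grad.zero_()

# usage: theta = torch.zeros_like(adj, requires_grad=True); repeat dga_step(...)
```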
+
+
+
+
+ + ☆ Mixed Variational Flows for Discrete Variables + + +
+ Variational flows allow practitioners to learn complex continuous +distributions, but approximating discrete distributions remains a challenge. +Current methodologies typically embed the discrete target in a continuous space +- usually via continuous relaxation or dequantization - and then apply a +continuous flow. These approaches involve a surrogate target that may not +capture the original discrete target, might have biased or unstable gradients, +and can create a difficult optimization problem. In this work, we develop a +variational flow family for discrete distributions without any continuous +embedding. First, we develop a measure-preserving and discrete (MAD) invertible +map that leaves the discrete target invariant, and then create a mixed +variational flow (MAD Mix) based on that map. We also develop an extension to +MAD Mix that handles joint discrete and continuous models. Our experiments +suggest that MAD Mix produces more reliable approximations than +continuous-embedding flows while being significantly faster to train. + +
+
+
+
+
+ + ☆ InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning + + +
+ One-Shot Neural Architecture Search (NAS) algorithms often rely on training a +hardware agnostic super-network for a domain specific task. Optimal +sub-networks are then extracted from the trained super-network for different +hardware platforms. However, training super-networks from scratch can be +extremely time consuming and compute intensive, especially for large models that +rely on a two-stage training process of pre-training and fine-tuning. +State-of-the-art pre-trained models are available for a wide range of tasks, but their +large sizes significantly limit their applicability on various hardware +platforms. We propose InstaTune, a method that leverages off-the-shelf +pre-trained weights for large models and generates a super-network during the +fine-tuning stage. InstaTune has multiple benefits. Firstly, since the process +happens during fine-tuning, it minimizes the overall time and compute resources +required for NAS. Secondly, the sub-networks extracted are optimized for the +target task, unlike prior work that optimizes on the pre-training objective. +Finally, InstaTune is easy to "plug and play" in existing frameworks. By using +multi-objective evolutionary search algorithms along with lightly trained +predictors, we find Pareto-optimal sub-networks that outperform their +respective baselines across different performance objectives such as accuracy +and MACs. Specifically, we demonstrate that our approach performs well across +both unimodal (ViT and BERT) and multi-modal (BEiT-3) transformer based +architectures.
+
+
+
+
+
+ + ☆ Measurement Tampering Detection Benchmark + + +
+ When training powerful AI systems to perform complex tasks, it may be +challenging to provide training signals which are robust to optimization. One +concern is measurement tampering, where the AI system manipulates multiple +measurements to create the illusion of good results instead of achieving the +desired outcome. In this work, we build four new text-based datasets to +evaluate measurement tampering detection techniques on large language models. +Concretely, given sets of text inputs and measurements aimed at determining if +some outcome occurred, as well as a base model able to accurately predict +measurements, the goal is to determine if examples where all measurements +indicate the outcome actually had the outcome occur, or if this was caused by +measurement tampering. We demonstrate techniques that outperform simple +baselines on most datasets, but don't achieve maximum performance. We believe +there is significant room for improvement for both techniques and datasets, and +we are excited for future work tackling measurement tampering. + +
+
+
+
+
+ + ☆ An Experimental Comparison of Partitioning Strategies for Distributed + Graph Neural Network Training + + +
+ Recently, graph neural networks (GNNs) have gained much attention as a +growing area of deep learning capable of learning on graph-structured data. +However, the computational and memory requirements for training GNNs on +large-scale graphs can exceed the capabilities of single machines or GPUs, +making distributed GNN training a promising direction for large-scale GNN +training. A prerequisite for distributed GNN training is to partition the input +graph into smaller parts that are distributed among multiple machines of a +compute cluster. Although graph partitioning has been extensively studied with +regard to graph analytics and graph databases, its effect on GNN training +performance is largely unexplored.
+ In this paper, we study the effectiveness of graph partitioning for +distributed GNN training. Our study aims to understand how different factors +such as GNN parameters, mini-batch size, graph type, feature size, and +scale-out factor influence the effectiveness of graph partitioning. We conduct +experiments with two different GNN systems using vertex and edge partitioning. +We found that graph partitioning is a crucial pre-processing step that can +heavily reduce the training time and memory footprint. Furthermore, our results +show that the time invested in partitioning can be amortized by the reduced GNN +training time, making partitioning a relevant optimization.
+
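As a minimal illustration of the kind of quantity such a study measures, the edge-cut of a vertex partition (a proxy for cross-machine communication) can be computed with networkx; the partitioners and GNN systems evaluated in the paper are of course far more sophisticated than this random assignment.

```python
import networkx as nx
import numpy as np

G = nx.karate_club_graph()                   # stand-in for a GNN input graph
rng = np.random.default_rng(0)
parts = rng.integers(0, 4, size=G.number_of_nodes())  # 4 machines, random split
edge_cut = sum(int(parts[u] != parts[v]) for u, v in G.edges())
print(f"cut edges: {edge_cut} of {G.number_of_edges()}")
```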
+
+
+
+
+ + ☆ Can transformers learn the greatest common divisor? + + +
+ I investigate the capability of small transformers to compute the greatest +common divisor (GCD) of two positive integers. When the training distribution +and the representation base are carefully chosen, models achieve 98% accuracy +and correctly predict 91 of the first 100 GCDs. Model predictions are +deterministic and fully interpretable. During training, the models learn to +cluster input pairs with the same GCD, and classify them by their divisors. +Basic models, trained from uniform operands encoded on small bases, only +compute a handful of GCDs (up to 38 out of 100): the products of divisors of the +base. Longer training and larger bases allow some models to "grok" small prime +GCDs. Training from log-uniform operands boosts performance to 73 correct GCDs, +and balancing the training distribution of GCDs, from inverse square to +log-uniform, lifts it further to 91. Training models from a uniform distribution +of GCDs breaks the deterministic model behavior.
+
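A toy version of the data recipe is easy to write down, assuming log-uniform operands encoded digit-by-digit in a chosen base; the base and distribution are exactly the knobs the paper varies.

```python
import math
import random

def sample_pair(max_val: int = 10**6):
    """Draw two log-uniform operands in [1, max_val]."""
    a = int(math.exp(random.uniform(0, math.log(max_val))))
    b = int(math.exp(random.uniform(0, math.log(max_val))))
    return max(a, 1), max(b, 1)

def to_base(n: int, base: int = 31):
    """Encode n as a digit sequence (most significant first)."""
    digits = []
    while n:
        digits.append(n % base)
        n //= base
    return digits[::-1] or [0]

a, b = sample_pair()
example = (to_base(a), to_base(b), to_base(math.gcd(a, b)))  # input/target pair
```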
+
+
+
+
+ + ☆ Prototype Fission: Closing Set for Robust Open-set Semi-supervised + Learning + + +
+ Semi-supervised Learning (SSL) has been proven vulnerable to +out-of-distribution (OOD) samples in realistic large-scale unsupervised +datasets due to over-confident pseudo-labeling OODs as in-distribution (ID). A +key underlying problem is class-wise latent space spreading from closed seen +space to open unseen space, and the bias is further magnified in SSL's +self-training loops. To close the ID distribution set so that OODs are better +rejected for safe SSL, we propose Prototype Fission (PF) to divide class-wise +latent spaces into compact sub-spaces by automatic fine-grained latent space +mining, driven by coarse-grained labels only. Specifically, we form multiple +unique learnable sub-class prototypes for each class, optimized towards both +diversity and consistency. The Diversity Modeling term encourages samples to be +clustered by one of the multiple sub-class prototypes, while the Consistency +Modeling term clusters all samples of the same class to a global prototype. +Instead of "opening set", i.e., modeling OOD distribution, Prototype Fission +"closes set" and makes it hard for OOD samples to fit in sub-class latent +space. Therefore, PF is compatible with existing methods for further +performance gains. Extensive experiments validate the effectiveness of our +method in open-set SSL settings in terms of successfully forming sub-classes, +discriminating OODs from IDs and improving overall accuracy. Codes will be +released.
+
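A rough PyTorch sketch of the two terms described above follows; the tensor shapes and the unweighted sum are assumptions, not the authors' exact formulation.

```python
import torch
import torch.nn.functional as F

def pf_losses(feats, labels, sub_protos, global_protos):
    """Diversity + consistency terms over learnable prototypes.

    feats: (batch, dim); labels: (batch,)
    sub_protos: (n_classes, n_sub, dim); global_protos: (n_classes, dim)
    """
    protos = sub_protos[labels]                        # (batch, n_sub, dim)
    d = (feats.unsqueeze(1) - protos).pow(2).sum(-1)   # dist to sub-prototypes
    diversity = d.min(dim=1).values.mean()             # nearest sub-prototype
    consistency = F.mse_loss(feats, global_protos[labels])
    return diversity + consistency
```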
+
+
+
+
+ + ☆ Learning Sequential Information in Task-based fMRI for Synthetic Data + Augmentation MICCAI + + +
+ Insufficiency of training data is a persistent issue in medical image +analysis, especially for task-based functional magnetic resonance images (fMRI) +with spatio-temporal imaging data acquired using specific cognitive tasks. In +this paper, we propose an approach for generating synthetic fMRI sequences that +can then be used to create augmented training datasets in downstream learning +tasks. To synthesize high-resolution task-specific fMRI, we adapt the +$\alpha$-GAN structure, leveraging advantages of both GAN and variational +autoencoder models, and propose different alternatives in aggregating temporal +information. The synthetic images are evaluated from multiple perspectives +including visualizations and an autism spectrum disorder (ASD) classification +task. The results show that the synthetic task-based fMRI can provide effective +data augmentation in learning the ASD classification task. + +
+
+ comment: Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI + workshop), preprint version +
+
+
+
+
+ + ☆ Glocal Explanations of Expected Goal Models in Soccer + + +
+ The expected goal models have gained popularity, but their interpretability +is often limited, especially when trained using black-box methods. Explainable +artificial intelligence tools have emerged to enhance model transparency and +extract descriptive knowledge for a single observation or for all observations. +However, explaining black-box models for a specific group of observations may +be more useful in some domains. This paper introduces the glocal explanations +(between local and global levels) of the expected goal models to enable +performance analysis at the team and player levels by proposing the use of +aggregated versions of the SHAP values and partial dependence profiles. This +allows knowledge to be extracted from the expected goal model for a player or +team rather than just a single shot. In addition, we conducted real-data +applications to illustrate the usefulness of aggregated SHAP and aggregated +profiles. The paper concludes with remarks on the potential of these +explanations for performance analysis in soccer analytics. + +
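The aggregation step itself is straightforward; assuming a fitted tree-based xG model `xg_model`, a shot-feature frame `X_shots`, and shot metadata `shots_meta` (all hypothetical names), per-player SHAP profiles can be computed as follows.

```python
import pandas as pd
import shap

explainer = shap.TreeExplainer(xg_model)      # fitted gradient-boosted xG model
shap_vals = explainer.shap_values(X_shots)    # (n_shots, n_features)

sv = pd.DataFrame(shap_vals, columns=X_shots.columns)
sv["player"] = shots_meta["player"].values    # one row per shot
player_profile = sv.groupby("player").mean()  # aggregated SHAP per player
```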
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+ + ☆ Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster + Analysis + + +
+ We introduce the use of a reduction property of the penalty-based formulation +of pseudo-Boolean polynomials as a mechanism for invariant dimensionality +reduction in cluster analysis. In our experiments, we show that +multidimensional data, like the 4-dimensional Iris flower dataset, can be reduced +to a 2-dimensional space while the 30-dimensional Wisconsin Diagnostic Breast +Cancer (WDBC) dataset can be reduced to a 3-dimensional space, and by searching +for lines or planes that lie between the reduced samples we can extract clusters +in a linear and unbiased manner with competitive accuracy, reproducibility, and +clear interpretation.
+
+
+ comment: 14 pages, 4 figures, submitted to the International Conference Data + Analysis, Optimization and Their Applications on the Occasion of Boris + Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region, + Moscow Institute of Physics and Technology + https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php +
+
+
+
+
+ + ☆ Pure Exploration under Mediators' Feedback + + +
+ Stochastic multi-armed bandits are a sequential-decision-making framework, +where, at each interaction step, the learner selects an arm and observes a +stochastic reward. Within the context of best-arm identification (BAI) +problems, the goal of the agent lies in finding the optimal arm, i.e., the one +with the highest expected reward, as accurately and efficiently as possible. +Nevertheless, the sequential interaction protocol of classical BAI problems, +where the agent has complete control over the arm being pulled at each round, +does not effectively model several decision-making problems of interest (e.g., +off-policy learning, partially controllable environments, and human feedback). +For this reason, in this work, we propose a novel strict generalization of the +classical BAI problem that we refer to as best-arm identification under +mediators' feedback (BAI-MF). More specifically, we consider the scenario in +which the learner has access to a set of mediators, each of which selects the +arms on the agent's behalf according to a stochastic and possibly unknown +policy. The mediator, then, communicates back to the agent the pulled arm +together with the observed reward. In this setting, the agent's goal lies in +sequentially choosing which mediator to query to identify with high probability +the optimal arm while minimizing the identification time, i.e., the sample +complexity. To this end, we first derive and analyze a statistical lower bound +on the sample complexity specific to our general mediator feedback scenario. +Then, we propose a sequential decision-making strategy for discovering the best +arm under the assumption that the mediators' policies are known to the learner. +As our theory verifies, this algorithm matches the lower bound both almost +surely and in expectation. Finally, we extend these results to cases where the +mediators' policies are unknown to the learner, obtaining comparable results.
+
+
+
+
+
+ + ☆ Adversarial Style Transfer for Robust Policy Optimization in Deep + Reinforcement Learning + + +
+ This paper proposes an algorithm that aims to improve generalization for +reinforcement learning agents by removing overfitting to confounding features. +Our approach consists of a max-min game theoretic objective. A generator +transfers the style of observation during reinforcement learning. An additional +goal of the generator is to perturb the observation, which maximizes the +agent's probability of taking a different action. In contrast, a policy network +updates its parameters to minimize the effect of such perturbations, thus +staying robust while maximizing the expected future reward. Based on this +setup, we propose a practical deep reinforcement learning algorithm, +Adversarial Robust Policy Optimization (ARPO), to find a robust policy that +generalizes to unseen environments. We evaluate our approach on Procgen and +Distracting Control Suite for generalization and sample efficiency. +Empirically, ARPO shows improved performance compared to a few baseline +algorithms, including data augmentation. + +
+
+
+
+
+ + ☆ Tuning the perplexity for and computing sampling-based t-SNE embeddings + + +
+ Widely used pipelines for the analysis of high-dimensional data utilize +two-dimensional visualizations. These are created, e.g., via t-distributed +stochastic neighbor embedding (t-SNE). When it comes to large data sets, +applying these visualization techniques creates suboptimal embeddings, as the +hyperparameters are not suitable for large data. Cranking up these parameters +usually does not work as the computations become too expensive for practical +workflows. In this paper, we argue that a sampling-based embedding approach can +circumvent these problems. We show that hyperparameters must be chosen +carefully, depending on the sampling rate and the intended final embedding. +Further, we show how this approach speeds up the computation and increases the +quality of the embeddings. + +
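A skeletal version of the workflow, assuming scikit-learn and a data array `X`, is shown below; the linear scaling of perplexity with the sampling rate is an illustrative assumption, and choosing it well is precisely the paper's subject.

```python
import numpy as np
from sklearn.manifold import TSNE

rate = 0.1                                    # sampling rate
rng = np.random.default_rng(0)
idx = rng.choice(len(X), size=int(rate * len(X)), replace=False)

perplexity = max(30 * rate, 5)                # assumed scaling with the rate
emb = TSNE(perplexity=perplexity, init="pca").fit_transform(X[idx])
```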
+
+
+
+
+ + ♻ ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data +scientists) and practitioners (e.g., clinicians), hampering the full +utilization of ML for clinical data analysis. We investigated the potential of +the chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this +gap and perform ML analyses efficiently. Real-world clinical datasets and study +details from large trials across various medical specialties were presented to +chatGPT ADA without specific guidance. ChatGPT ADA autonomously developed +state-of-the-art ML models based on the original study's training data to +predict clinical outcomes such as cancer development, cancer progression, +disease complications, or biomarkers such as pathogenic gene sequences. +Strikingly, these ML models matched or outperformed their published +counterparts. We conclude that chatGPT ADA offers a promising avenue to +democratize ML in medicine, making advanced analytics accessible to non-ML +experts and promoting broader applications in medical research and practice. + +
+
+
+
+
+ + ♻ ☆ Fairness-aware Vision Transformer via Debiased Self-Attention + + +
+ Vision Transformer (ViT) has recently gained significant interest in solving +computer vision (CV) problems due to its capability of extracting informative +features and modeling long-range dependencies through the self-attention +mechanism. To fully realize the advantages of ViT in real-world applications, +recent works have explored the trustworthiness of ViT, including its robustness +and explainability. However, another desideratum, fairness, has not yet been +adequately addressed in the literature. We establish that the existing +fairness-aware algorithms (primarily designed for CNNs) do not perform well on +ViT. This motivates the development of our novel framework via Debiased +Self-Attention (DSA). DSA is a fairness-through-blindness approach that +enforces ViT to eliminate spurious features correlated with the sensitive +attributes for bias mitigation. Notably, adversarial examples are leveraged to +locate and mask the spurious features in the input image patches. In addition, +DSA utilizes an attention weights alignment regularizer in the training +objective to encourage learning informative features for target prediction. +Importantly, our DSA framework leads to improved fairness guarantees over prior +works on multiple prediction tasks without compromising target prediction +performance.
+
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a meticulously curated, comprehensive +instruction dataset expressly designed for the biomolecular realm. +Mol-Instructions is composed of three pivotal components: molecule-oriented +instructions, protein-oriented instructions, and biomolecular text +instructions, each curated to enhance the understanding and prediction +capabilities of LLMs concerning biomolecular features and behaviors. Through +extensive instruction tuning experiments on the representative LLM, we +underscore the potency of Mol-Instructions to enhance the adaptability and +cognitive acuity of large models within the complex sphere of biomolecular +studies, thereby promoting advancements in the biomolecular research community. +Mol-Instructions is made publicly accessible for future research endeavors and +will be subjected to continual updates for enhanced applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions. Add + quantitative evaluations +
+
+
+
+
+ + ♻ ☆ An Empirical Investigation of the Role of Pre-training in Lifelong + Learning + + +
+ The lifelong learning paradigm in machine learning is an attractive +alternative to the more prominent isolated learning scheme not only due to its +resemblance to biological learning but also its potential to reduce energy +waste by obviating excessive model re-training. A key challenge to this +paradigm is the phenomenon of catastrophic forgetting. With the increasing +popularity and success of pre-trained models in machine learning, we pose the +question: What role does pre-training play in lifelong learning, specifically +with respect to catastrophic forgetting? We investigate existing methods in the +context of large, pre-trained models and evaluate their performance on a +variety of text and image classification tasks, including a large-scale study +using a novel data set of 15 diverse NLP tasks. Across all settings, we observe +that generic pre-training implicitly alleviates the effects of catastrophic +forgetting when learning multiple tasks sequentially compared to randomly +initialized models. We then further investigate why pre-training alleviates +forgetting in this setting. We study this phenomenon by analyzing the loss +landscape, finding that pre-trained weights appear to ease forgetting by +leading to wider minima. Based on this insight, we propose jointly optimizing +for current task loss and loss basin sharpness to explicitly encourage wider +basins during sequential fine-tuning. We show that this optimization approach +outperforms several state-of-the-art task-sequential continual learning +algorithms across multiple settings, occasionally even without retaining a +memory that scales in size with the number of tasks. + +
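A sharpness-penalizing step in the SAM style conveys the flavor of the proposed objective; this PyTorch sketch is a generic sharpness-aware update, not the authors' exact joint optimization.

```python
import torch

def sam_step(model, loss_fn, batch, opt, rho: float = 0.05):
    """One sharpness-aware update: perturb toward the local worst case,
    then descend on the loss at the perturbed point."""
    loss_fn(model, batch).backward()
    with torch.no_grad():
        grads = [p.grad.clone() for p in model.parameters()]
        norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))
        eps = [rho * g / (norm + 1e-12) for g in grads]
        for p, e in zip(model.parameters(), eps):
            p.add_(e)                        # climb into the sharp neighborhood
    opt.zero_grad()
    loss_fn(model, batch).backward()         # gradient at the perturbed point
    with torch.no_grad():
        for p, e in zip(model.parameters(), eps):
            p.sub_(e)                        # restore the original weights
    opt.step()
```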
+
+
+
+
+ + ♻ ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking +closer to real-world applications, both in the nature of data tested ($X$: +multi-channel, multi-paged, multi-industry; $Y$: class distributions and label +set variety) and in classification tasks considered ($f$: multi-page document, +page stream, and document bundle classification, ...). We identify the lack of +public multi-page document classification datasets, formalize different +classification tasks arising in application scenarios, and motivate the value +of targeting efficient multi-page document representations. An experimental +study on proposed multi-page document classification datasets demonstrates that +current benchmarks have become irrelevant and need to be updated to evaluate +complete documents, as they naturally occur in practice. This reality check +also calls for more mature evaluation methodologies, covering calibration +evaluation, inference complexity (time-memory), and a range of realistic +distribution shifts (e.g., born-digital vs. scanning noise, shifting page +order). Our study ends on a hopeful note by recommending concrete avenues for +future improvements.
+
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ♻ ☆ inTformer: A Time-Embedded Attention-Based Transformer for Crash + Likelihood Prediction at Intersections Using Connected Vehicle Data + + +
+ The real-time crash likelihood prediction model is an essential component of +the proactive traffic safety management system. Over the years, numerous +studies have attempted to construct a crash likelihood prediction model in +order to enhance traffic safety, but mostly on freeways. In the majority of the +existing studies, researchers have primarily employed a deep learning-based +framework to identify crash potential. Lately, Transformer has emerged as a +potential deep neural network that fundamentally operates through +attention-based mechanisms. Transformer has several functional benefits over +extant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can +readily handle long-term dependencies in a data sequence. Secondly, +Transformers can process all elements in a data sequence in parallel during +training. Finally, a Transformer does not have the vanishing gradient issue. +Realizing the immense possibility of Transformers, this paper proposes +inTersection-Transformer (inTformer), a time-embedded attention-based +Transformer model that can effectively predict intersection crash likelihood in +real-time. The proposed model was evaluated using connected vehicle data +extracted from the Signal Analytics Platform. Acknowledging the complex traffic +operation mechanism at intersections, this study developed zone-specific models +by dividing the intersection region into two distinct zones: +within-intersection and approach zone. The best inTformer models in the +'within-intersection' and 'approach' zones achieved sensitivities of 73% and +70%, respectively. The zone-level models were also compared to earlier studies +on crash likelihood prediction at intersections and with several established +deep learning models trained on the same connected vehicle dataset.
+
+
+ comment: 29 pages, 10 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ A Bayesian Framework for Digital Twin-Based Control, Monitoring, and + Data Collection in Wireless Systems + + +
+ Commonly adopted in the manufacturing and aerospace sectors, digital twin +(DT) platforms are increasingly seen as a promising paradigm to control, +monitor, and analyze software-based, "open", communication systems. Notably, DT +platforms provide a sandbox in which to test artificial intelligence (AI) +solutions for communication systems, potentially reducing the need to collect +data and test algorithms in the field, i.e., on the physical twin (PT). A key +challenge in the deployment of DT systems is to ensure that virtual control +optimization, monitoring, and analysis at the DT are safe and reliable, +avoiding incorrect decisions caused by "model exploitation". To address this +challenge, this paper presents a general Bayesian framework with the aim of +quantifying and accounting for model uncertainty at the DT that is caused by +limitations in the amount and quality of data available at the DT from the PT. +In the proposed framework, the DT builds a Bayesian model of the communication +system, which is leveraged to enable core DT functionalities such as control +via multi-agent reinforcement learning (MARL), monitoring of the PT for anomaly +detection, prediction, data-collection optimization, and counterfactual +analysis. To exemplify the application of the proposed framework, we +specifically investigate a case-study system encompassing multiple sensing +devices that report to a common receiver. Experimental results validate the +effectiveness of the proposed Bayesian framework as compared to standard +frequentist model-based solutions. + +
+
+ comment: Accepted for publication in IEEE Journal on Selected Areas in + Communications ; Extends and subsumes arXiv:2210.05582 ; Updates: - + 18/01/2023: Updated reference ; - 29/08/2023: Revised manuscript version +
+
+
+
+
+ + ♻ ☆ Investigating Reproducibility at Interspeech Conferences: A Longitudinal + and Comparative Perspective + + +
+ Reproducibility is a key aspect for scientific advancement across +disciplines, and reducing barriers for open science is a focus area for the +theme of Interspeech 2023. Availability of source code is one of the indicators +that facilitates reproducibility. However, less is known about the rates of +reproducibility at Interspeech conferences in comparison to other conferences +in the field. In order to fill this gap, we have surveyed 27,717 papers at +seven conferences across speech and language processing disciplines. We find +that despite having a close number of accepted papers to the other conferences, +Interspeech has up to 40% less source code availability. In addition to +reporting the difficulties we have encountered during our research, we also +provide recommendations and possible directions to increase reproducibility for +further studies. + +
+
+
+
+
+ + ♻ torchgfn: A PyTorch GFlowNet library + + +
+ The growing popularity of generative flow networks (GFlowNets or GFNs) from a +range of researchers with diverse backgrounds and areas of expertise +necessitates a library which facilitates the testing of new features such as +training losses that can be easily compared to standard benchmark +implementations, or on a set of common environments. torchgfn is a PyTorch +library that aims to address this need. It provides users with a simple API for +environments and useful abstractions for samplers and losses. Multiple examples +are provided, replicating and unifying published results. The code is available +at https://github.com/saleml/torchgfn.
+
+
+
+
+
+ + ♻ ☆ Application Performance Modeling via Tensor Completion + + +
+ Performance tuning, software/hardware co-design, and job scheduling are among +the many tasks that rely on models to predict application performance. We +propose and evaluate low-rank tensor decomposition for modeling application +performance. We discretize the input and configuration domains of an +application using regular grids. Application execution times mapped within +grid-cells are averaged and represented by tensor elements. We show that +low-rank canonical-polyadic (CP) tensor decomposition is effective in +approximating these tensors. We further show that this decomposition enables +accurate extrapolation of unobserved regions of an application's parameter +space. We then employ tensor completion to optimize a CP decomposition given a +sparse set of observed execution times. We consider alternative +piecewise/grid-based models and supervised learning models for six applications +and demonstrate that CP decomposition optimized using tensor completion offers +higher prediction accuracy and memory-efficiency for high-dimensional +performance modeling. + +
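Assuming tensorly, whose parafac routine accepts a mask for unobserved entries, the modeling step might look as follows; the grid size, rank, and fill values are illustrative.

```python
import numpy as np
import tensorly as tl
from tensorly.decomposition import parafac

rng = np.random.default_rng(0)
T = np.zeros((8, 8, 8))                      # gridded, averaged execution times
mask = rng.random(T.shape) < 0.2             # ~20% of grid cells observed
T[mask] = rng.random(int(mask.sum()))        # observed (toy) runtimes

# Fit a rank-4 CP model using only the observed entries, then reconstruct.
cp = parafac(tl.tensor(T), rank=4, mask=tl.tensor(mask.astype(float)))
T_hat = tl.cp_to_tensor(cp)                  # predicted runtimes everywhere
```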
+
+
+
+
+ + ♻ ☆ Learning Bayesian Networks with Heterogeneous Agronomic Data Sets via + Mixed-Effect Models and Hierarchical Clustering + + +
+ Research involving diverse but related data sets, where associations between +covariates and outcomes may vary, is prevalent in various fields including +agronomic studies. In these scenarios, hierarchical models, also known as +multilevel models, are frequently employed to assimilate information from +different data sets while accommodating their distinct characteristics. +However, their structure extends beyond simple heterogeneity, as variables often +form complex networks of causal relationships.
+ Bayesian networks (BNs) provide a powerful framework for modelling such +relationships using directed acyclic graphs to illustrate the connections +between variables. This study introduces a novel approach that integrates +random effects into BN learning. Rooted in linear mixed-effects models, this +approach is particularly well-suited for handling hierarchical data. Results +from a real-world agronomic trial suggest that employing this approach enhances +structural learning, leading to the discovery of new connections and improved +model specification. Furthermore, we observe a +reduction in prediction errors from 28% to 17%. By extending the applicability +of BNs to complex data set structures, this approach contributes to the +effective utilisation of BNs for hierarchical agronomic data. This, in turn, +enhances their value as decision-support tools in the field.
+
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Quantifying Causes of Arctic Amplification via Deep Learning based + Time-series Causal Inference + + +
+ The warming of the Arctic, also known as Arctic amplification, is led by +several atmospheric and oceanic drivers. However, the details of its underlying +thermodynamic causes are still unknown. Inferring the causal effects of +atmospheric processes on sea ice melt using fixed treatment effect strategies +leads to unrealistic counterfactual estimations. Such models are also prone to +bias due to time-varying confoundedness. Further, the complex non-linearity in +Earth science data makes it infeasible to perform causal inference using +existing marginal structural techniques. In order to tackle these challenges, +we propose TCINet - time-series causal inference model to infer causation under +continuous treatment using recurrent neural networks and a novel probabilistic +balancing technique. Through experiments on synthetic and observational data, +we show how our research can substantially improve the ability to quantify +leading causes of Arctic sea ice melt, further paving paths for causal +inference in observational Earth science. + +
+
+
+
+
+ + ♻ ☆ Bayesian Feature Selection in Joint Quantile Time Series Analysis + + +
+ Quantile feature selection over correlated multivariate time series data has +always been a methodological challenge and is an open problem. In this paper, +we propose a general Bayesian dimension reduction methodology for feature +selection in high-dimensional joint quantile time series analysis, under the +name of the quantile feature selection time series (QFSTS) model. The QFSTS +model is a general structural time series model, where each component yields an +additive contribution to the time series modeling with direct interpretations. +Its flexibility is twofold: users can add or remove components for each time +series, and each time series can have its own specifically valued components of +different sizes. Feature selection is conducted in the quantile +regression component, where each time series has its own pool of +contemporaneous external predictors allowing nowcasting. Bayesian methodology +in extending feature selection to the quantile time series research area is +developed using the multivariate asymmetric Laplace distribution, a spike-and-slab +prior setup, the Metropolis-Hastings algorithm, and the Bayesian model +averaging technique, all implemented consistently in the Bayesian paradigm. The +QFSTS model requires small datasets to train and converges fast. Extensive +examinations confirmed that the QFSTS model has superior performance in feature +selection, parameter estimation, and forecasting.
+
+
+ comment: Accepted to the Bayesian Analysis journal +
+
+
+
+
+ + ♻ ☆ Uncertainty-inspired Open Set Learning for Retinal Anomaly + Identification + + +
+ Failure to recognize samples from the classes unseen during training is a +major limitation of artificial intelligence in real-world implementations +for the recognition and classification of retinal anomalies. We established an +uncertainty-inspired open-set (UIOS) model, which was trained with fundus +images of 9 retinal conditions. Besides assessing the probability of each +category, UIOS also calculated an uncertainty score to express its confidence. +Our UIOS model with a thresholding strategy achieved an F1 score of 99.55%, +97.01% and 91.91% for the internal testing set, external target categories +(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1 +score of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS +correctly predicted high uncertainty scores, which would prompt the need for a +manual check in the datasets of non-target-category retinal diseases, +low-quality fundus images, and non-fundus images. UIOS provides a robust method +for real-world screening of retinal anomalies.
+
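The thresholding strategy can be illustrated with a generic entropy-based uncertainty score in PyTorch; UIOS learns its score differently, so this is only a sketch of the deferral logic.

```python
import torch

def predict_or_defer(logits: torch.Tensor, tau: float = 0.5):
    """Return class predictions, with -1 marking samples deferred for
    manual review because their normalized entropy exceeds tau."""
    probs = torch.softmax(logits, dim=-1)
    entropy = -(probs * probs.clamp_min(1e-12).log()).sum(-1)
    uncertainty = entropy / torch.log(torch.tensor(float(logits.shape[-1])))
    preds = probs.argmax(-1)
    decision = torch.where(uncertainty > tau,
                           torch.full_like(preds, -1),  # -1 = manual check
                           preds)
    return decision, uncertainty
```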
+
+
+
+
+ + ♻ ☆ Combinatorial Pure Exploration with Full-bandit Feedback and Beyond: + Solving Combinatorial Optimization under Uncertainty with Limited Observation + + +
+ Combinatorial optimization is one of the fundamental research fields that has +been extensively studied in theoretical computer science and operations +research. When developing an algorithm for combinatorial optimization, it is +commonly assumed that parameters such as edge weights are exactly known as +inputs. However, this assumption may not be fulfilled since input parameters +are often uncertain or initially unknown in many applications such as +recommender systems, crowdsourcing, communication networks, and online +advertisement. To resolve such uncertainty, the problem of combinatorial pure +exploration of multi-armed bandits (CPE) and its variants have received +increasing attention. Earlier work on CPE has studied semi-bandit feedback +or assumed that the outcome from each individual edge is always accessible at +all rounds. However, due to practical constraints such as a budget ceiling or +privacy concern, such strong feedback is not always available in recent +applications. In this article, we review recently proposed techniques for +combinatorial pure exploration problems with limited feedback.
+
+
+ comment: Preprint of an Invited Review Article, In Fields Institute +
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research +since its inception. The technical evolution and emerging capabilities of the +latest cohort of large language models (LLMs) have reinvigorated the subject +beyond academia to the cultural zeitgeist. While recent NLP evaluation +benchmark tasks test some aspects of human-imitative behaviour (e.g., +BIG-bench's 'human-like behavior' tasks), few, if any, examine creative +problem solving abilities. Creative problem solving in humans is a well-studied +topic in cognitive neuroscience with standardized tests that predominantly use +the ability to associate (heterogeneous) connections among clue words as a +metric for creativity. Exposure to misleading stimuli - distractors dubbed red +herrings - impedes human performance in such tasks via the fixation effect and +Einstellung paradigm. In cognitive neuroscience studies, such fixations are +experimentally induced by pre-exposing participants to orthographically similar +incorrect words to subsequent word-fragments or clues. The popular British quiz +show Only Connect's Connecting Wall segment essentially mimics Mednick's Remote +Associates Test (RAT) formulation with built-in, deliberate red herrings, which +makes it an ideal proxy dataset to explore and study fixation effect and +Einstellung paradigm from cognitive neuroscience in LLMs. In this paper we +present the novel Only Connect Wall (OCW) dataset and report results from our +evaluation of selected pre-trained language models and LLMs on creative problem +solving tasks like grouping clue words by heterogeneous connections, and +identifying correct open knowledge domain connections in respective groups. We +synthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to +further analyze our red-herrings hypothesis in language models. The code and +link to the dataset are available at https://github.com/TaatiTeam/OCW.
+
+
+ comment: V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption + overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet + results in Section 4.3 (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Deep Learning Based Residuals in Non-linear Factor Models: Precision + Matrix Estimation of Returns with Low Signal-to-Noise Ratio + + +
+ This paper introduces a consistent estimator and rate of convergence for the +precision matrix of asset returns in large portfolios using a non-linear factor +model within the deep learning framework. Our estimator remains valid even in +low signal-to-noise ratio environments typical for financial markets and is +compatible with weak factors. Our theoretical analysis establishes uniform +bounds on expected estimation risk based on deep neural networks for an +expanding number of assets. Additionally, we provide a new consistent +data-dependent estimator of error covariance in deep neural networks. Our +models demonstrate superior accuracy in extensive simulations and in empirical +applications.
+
+
+
+
+
+ + ♻ ☆ Atlas-Based Interpretable Age Prediction In Whole-Body MR Images + + +
+ Age prediction is an important part of medical assessments and research. It +can aid in detecting diseases as well as abnormal ageing by highlighting the +discrepancy between chronological and biological age. To gain a comprehensive +understanding of age-related changes observed in various body parts, we +investigate them on a larger scale by using whole-body images. We utilise the +Grad-CAM interpretability method to determine the body areas most predictive of +a person's age. We expand our analysis beyond individual subjects by employing +registration techniques to generate population-wide interpretability maps. +Furthermore, we set a new state of the art in whole-body age prediction with a +model that achieves a mean absolute error of 2.76 years. Our findings reveal +three primary areas of interest: the spine, the autochthonous back muscles, and +the cardiac region, which exhibits the highest importance.
+
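For reference, a compact 2D Grad-CAM sketch in PyTorch is given below; `model` and its target layer are hypothetical (e.g., a ResNet regressor), and whole-body MR volumes would require the 3D analogue of these hooks.

```python
import torch
import torch.nn.functional as F

acts, grads = {}, {}

def fwd_hook(module, inputs, output):
    acts["a"] = output                       # feature maps of the target layer

def bwd_hook(module, grad_in, grad_out):
    grads["g"] = grad_out[0]                 # gradients w.r.t. those maps

layer = model.layer4                         # assumed target layer name
layer.register_forward_hook(fwd_hook)
layer.register_full_backward_hook(bwd_hook)

model(volume).sum().backward()               # volume: (1, C, H, W) input
w = grads["g"].mean(dim=(2, 3), keepdim=True)            # channel importance
cam = F.relu((w * acts["a"]).sum(dim=1, keepdim=True))   # age-relevance map
cam = F.interpolate(cam, size=volume.shape[-2:],
                    mode="bilinear", align_corners=False)
```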
+
+
+
+
+ + ♻ ☆ Generalized partitioned local depth + + +
+ In this paper we provide a generalization of the concept of cohesion as +introduced recently by Berenhaut, Moore and Melvin [Proceedings of the National +Academy of Sciences, 119 (4) (2022)]. The formulation presented builds on the +technique of partitioned local depth by distilling two key probabilistic +concepts: local relevance and support division. Earlier results are extended +within the new context, and examples of applications to revealing communities +in data with uncertainty are included. The work sheds light on the foundations +of partitioned local depth, and extends the original ideas to enable +probabilistic consideration of uncertain, variable and potentially conflicting +information. + +
+
+ comment: Improved exposition & motivation, references added, 19 pages, 6 + figures +
+
+
+
+
+ + ♻ ☆ An Analysis of Abstracted Model-Based Reinforcement Learning + + +
+ Many methods for Model-based Reinforcement learning (MBRL) in Markov decision +processes (MDPs) provide guarantees for both the accuracy of the model they can +deliver and the learning efficiency. At the same time, state abstraction +techniques allow for a reduction of the size of an MDP while maintaining a +bounded loss with respect to the original problem. Therefore, it may come as a +surprise that no such guarantees are available when combining both techniques, +i.e., where MBRL merely observes abstract states. Our theoretical analysis +shows that abstraction can introduce a dependence between samples collected +online (e.g., in the real world). That means that, without taking this +dependence into account, results for MBRL do not directly extend to this +setting. Our result shows that we can use concentration inequalities for +martingales to overcome this problem. This result makes it possible to extend +the guarantees of existing MBRL algorithms to the setting with abstraction. We +illustrate this by combining R-MAX, a prototypical MBRL algorithm, with +abstraction, thus producing the first performance guarantees for model-based +`RL from Abstracted Observations': model-based reinforcement learning with an +abstract model. + +
+
+ comment: 36 pages, 2 figures, submitted to TMLR +
+
+
+
+
+ + ♻ ☆ Strategic Coalition for Data Pricing in IoT Data Markets + + +
+ This paper considers a market for trading Internet of Things (IoT) data that +is used to train machine learning models. The data, either raw or processed, is +supplied to the market platform through a network and the price of such data is +controlled based on the value it brings to the machine learning model. We +explore the correlation property of data in a game-theoretical setting to +eventually derive a simplified distributed solution for a data trading +mechanism that emphasizes the mutual benefit of devices and the market. The key +proposal is an efficient algorithm for markets that jointly addresses the +challenges of availability and heterogeneity in participation, as well as the +transfer of trust and the economic value of data exchange in IoT networks. The +proposed approach establishes the data market by reinforcing collaboration +opportunities between devices with correlated data to avoid information leakage. +Therein, we develop a network-wide optimization problem that maximizes the +social value of coalitions among IoT devices with similar data types; at the +same time, it minimizes the cost due to network externalities, i.e., the impact +of information leakage due to data correlation, as well as the opportunity +costs. Finally, we reveal the structure of the formulated problem as a +distributed coalition game and solve it following the simplified +split-and-merge algorithm. Simulation results show the efficacy of our proposed +mechanism design toward a trusted IoT data market, with up to 32.72% gain in +the average payoff for each seller.
+
+
+ comment: 15 pages. 12 figures. This paper has been accepted for publication in + IEEE Internet of Things Journal. Copyright may change without notice +
+
+
+
+
+ + ♻ ☆ Cyclic and Randomized Stepsizes Invoke Heavier Tails in SGD than + Constant Stepsize + + +
+ Cyclic and randomized stepsizes are widely used in deep learning practice +and can often outperform standard stepsize choices such as constant stepsize in +SGD. Despite their empirical success, not much is currently known about when +and why they can theoretically improve the generalization performance. We +consider a general class of Markovian stepsizes for learning, which contains +i.i.d. random stepsizes, cyclic stepsizes, as well as constant stepsizes as +special cases, and motivated by the literature which shows that heaviness of +the tails (measured by the so-called "tail-index") in the SGD iterates is +correlated with generalization, we study the tail-index and provide a number of +theoretical results that demonstrate how the tail-index varies with the stepsize +scheduling. Our results bring a new understanding of the benefits of cyclic and +randomized stepsizes compared to constant stepsize in terms of the tail +behavior. We illustrate our theory on linear regression experiments and show +through deep learning experiments that Markovian stepsizes can achieve an even +heavier tail and be a viable alternative to cyclic and i.i.d. randomized +stepsize rules.
+
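The stepsize classes compared above are all instances of one mechanism: a Markov chain over a finite set of stepsizes (a constant chain, a deterministic cycle, or i.i.d. draws). A toy sketch on a scalar quadratic, with an assumed transition matrix, follows.

```python
import numpy as np

steps = np.array([1e-3, 1e-2, 1e-1])   # stepsize states
P = np.array([[0.8, 0.2, 0.0],         # Markov transitions between states
              [0.1, 0.8, 0.1],
              [0.0, 0.2, 0.8]])

rng = np.random.default_rng(0)
w, s = 5.0, 0                          # iterate and current stepsize state
for _ in range(1000):
    grad = 2 * w                       # gradient of f(w) = w**2
    w -= steps[s] * grad               # SGD update with Markovian stepsize
    s = rng.choice(3, p=P[s])          # move to the next stepsize state
print(w)
```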
+
+ comment: To Appear +
+
+
+
+
+ + ♻ ☆ Explainable AI Insights for Symbolic Computation: A case study on + selecting the variable ordering for cylindrical algebraic decomposition + + +
+ In recent years there has been increased use of machine learning (ML) +techniques within mathematics, including symbolic computation where it may be +applied safely to optimise or select algorithms. This paper explores whether +using explainable AI (XAI) techniques on such ML models can offer new insight +for symbolic computation, inspiring new implementations within computer algebra +systems that do not directly call upon AI tools. We present a case study on the +use of ML to select the variable ordering for cylindrical algebraic +decomposition. It has already been demonstrated that ML can make the choice +well, but here we show how the SHAP tool for explainability can be used to +inform new heuristics of a size and complexity similar to those human-designed +heuristics currently commonly used in symbolic computation. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Towards an AI-enabled Connected Industry: AGV Communication and Sensor + Measurement Datasets + + +
+ This paper presents two wireless measurement campaigns in industrial +testbeds: industrial Vehicle-to-vehicle (iV2V) and industrial +Vehicle-to-infrastructure plus Sensor (iV2I+), together with detailed +information about the two captured datasets. iV2V covers sidelink communication +scenarios between Automated Guided Vehicles (AGVs), while iV2I+ is conducted at +an industrial setting where an autonomous cleaning robot is connected to a +private cellular network. The combination of different communication +technologies within a common measurement methodology provides insights that can +be exploited by Machine Learning (ML) for tasks such as fingerprinting, +line-of-sight detection, prediction of quality of service or link selection. +Moreover, the datasets are publicly available, labelled and prefiltered for +fast on-boarding and applicability. + +
+
+ comment: 7 pages, 3 figures. Submitted to a magazine. Datasets available at + https://ieee-dataport.org/open-access/ai4mobile-industrial-wireless-datasets-iv2v-and-iv2i +
+
+
+
+
+ + ♻ ☆ Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation + for Pretrained Deep Generative Model CVPR2023 + + +
+ Semantic editing of images is the fundamental goal of computer vision. +Although deep learning methods, such as generative adversarial networks (GANs), +are capable of producing high-quality images, they often do not have an +inherent way of editing generated images semantically. Recent studies have +investigated a way of manipulating the latent variable to determine the images +to be generated. However, methods that assume linear semantic arithmetic have +certain limitations in terms of the quality of image editing, whereas methods +that discover nonlinear semantic pathways provide non-commutative editing, +which is inconsistent when applied in different orders. This study proposes a +novel method called deep curvilinear editing (DeCurvEd) to determine semantic +commuting vector fields on the latent space. We theoretically demonstrate that +owing to commutativity, the editing of multiple attributes depends only on the +quantities and not on the order. Furthermore, we experimentally demonstrate +that compared to previous methods, the nonlinear and commutative nature of +DeCurvEd facilitates the disentanglement of image attributes and provides +higher-quality editing. + +
+
+ comment: 15 pages. The last update made no changes except for adding the + following link to the CVF repository: + https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html. + Here, you can find our code to reproduce our results +
+
+
+
+
+ + ♻ ☆ Recurrent segmentation meets block models in temporal networks + + +
+ A popular approach to model interactions is to represent them as a network +with nodes being the agents and the interactions being the edges. Interactions +are often timestamped, which leads to having timestamped edges. Many real-world +temporal networks have a recurrent or possibly cyclic behaviour. For example, +social network activity may be heightened during certain hours of the day. In this +paper, our main interest is to model recurrent activity in such temporal +networks. As a starting point we use the stochastic block model, a popular choice +for modelling static networks, where nodes are split into $R$ groups. We extend +this model to temporal networks by modelling the edges with a Poisson process. +We make the parameters of the process dependent on time by segmenting the time +line into $K$ segments. To enforce the recurring activity we require that only +$H < K$ different sets of parameters can be used, that is, several, not +necessarily consecutive, segments must share their parameters. We prove that +searching for optimal blocks and segmentation is an NP-hard problem. +Consequently, we split the problem into 3 subproblems where we optimize blocks, +model parameters, and segmentation in turn while keeping the remaining +structures fixed. We propose an iterative algorithm that requires $O(KHm + Rn + +R^2H)$ time per iteration, where $n$ and $m$ are the number of nodes and edges +in the network. We demonstrate experimentally that the number of required +iterations is typically low, the algorithm is able to discover the ground truth +from synthetic datasets, and show that certain real-world networks exhibit +recurrent behaviour as the likelihood does not deteriorate when $H$ is lowered.
+
+
+
+
+
+ + ♻ ☆ Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR + Semantic Segmentation ICCV 2023 + + +
+ The ability to deploy robots that can operate safely in diverse environments +is crucial for developing embodied intelligent agents. As a community, we have +made tremendous progress in within-domain LiDAR semantic segmentation. However, +do these methods generalize across domains? To answer this question, we design +the first experimental setup for studying domain generalization (DG) for LiDAR +semantic segmentation (DG-LSS). Our results confirm a significant gap between +methods, evaluated in a cross-domain setting: for example, a model trained on +the source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data, +compared to $48.49$ mIoU obtained by the model trained on the target domain +(nuScenes). To tackle this gap, we propose the first method specifically +designed for DG-LSS, which obtains $34.88$ mIoU on the target domain, +outperforming all baselines. Our method augments a sparse-convolutional +encoder-decoder 3D segmentation network with an additional, dense 2D +convolutional decoder that learns to classify a birds-eye view of the point +cloud. This simple auxiliary task encourages the 3D network to learn features +that are robust to sensor placement shifts and resolution, and are transferable +across domains. With this work, we aim to inspire the community to develop and +evaluate future models in such cross-domain conditions. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Physics-informed neural networks for solving forward and inverse + problems in complex beam systems + + +
+ This paper proposes a new framework using physics-informed neural networks +(PINNs) to simulate complex structural systems that consist of single and +double beams based on Euler-Bernoulli and Timoshenko theory, where the double +beams are connected with a Winkler foundation. In particular, forward and +inverse problems for the Euler-Bernoulli and Timoshenko partial differential +equations (PDEs) are solved using nondimensional equations with the +physics-informed loss function. Higher-order complex beam PDEs are efficiently +solved for forward problems to compute the transverse displacements and +cross-sectional rotations with less than 1e-3 percent error. Furthermore, +inverse problems are robustly solved to determine the unknown dimensionless +model parameters and applied force in the entire space-time domain, even in the +case of noisy data. The results suggest that PINNs are a promising strategy for +solving problems in engineering structures and machines involving beam systems. + +
+
+
+
+
+ + ♻ ☆ A Survey of Imbalanced Learning on Graphs: Problems, Techniques, and + Future Directions + + +
+ Graphs represent interconnected structures prevalent in a myriad of +real-world scenarios. Effective graph analytics, such as graph learning +methods, enables users to gain profound insights from graph data, underpinning +various tasks including node classification and link prediction. However, these +methods often suffer from data imbalance, a common issue in graph data where +certain segments possess abundant data while others are scarce, thereby leading +to biased learning outcomes. This necessitates the emerging field of imbalanced +learning on graphs, which aims to correct these data distribution skews for +more accurate and representative learning outcomes. In this survey, we embark +on a comprehensive review of the literature on imbalanced learning on graphs. +We begin by providing a definitive understanding of the concept and related +terminologies, establishing a strong foundational understanding for readers. +Following this, we propose two comprehensive taxonomies: (1) the problem +taxonomy, which describes the forms of imbalance we consider, the associated +tasks, and potential solutions; (2) the technique taxonomy, which details key +strategies for addressing these imbalances, and aids readers in their method +selection process. Finally, we suggest prospective future directions for both +problems and techniques within the sphere of imbalanced learning on graphs, +fostering further innovation in this critical area. + +
+
+ comment: The collection of awesome literature on imbalanced learning on + graphs: https://github.com/Xtra-Computing/Awesome-Literature-ILoGs +
+
+
+
+
+ + ♻ ☆ Policy Gradient for Reinforcement Learning with General Utilities + + +
+ In Reinforcement Learning (RL), the goal of agents is to discover an optimal
+policy that maximizes the expected cumulative rewards. This objective may also
+be viewed as finding a policy that optimizes a linear function of its
+state-action occupancy measure, hereafter referred to as Linear RL. However,
+many supervised and unsupervised RL problems are not covered by the Linear RL
+framework, such as apprenticeship learning, pure exploration and variational
+intrinsic control, where the objectives are non-linear functions of the
+occupancy measures. RL with non-linear utilities looks unwieldy, as methods
+like the Bellman equation, value iteration, policy gradient and dynamic
+programming, which had tremendous success in Linear RL, fail to generalize
+trivially. In this paper, we derive the policy gradient theorem for RL with
+general utilities. The policy gradient theorem proves to be a cornerstone in
+Linear RL due to its elegance and ease of implementability. Our policy gradient
+theorem for RL with general utilities shares the same elegance and ease of
+implementability. Based on the derived policy gradient theorem, we also present
+a simple sample-based algorithm. We believe our results will be of interest to
+the community and offer inspiration to future works in this generalized
+setting.
+
 
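+ One common way to operationalize such a theorem (not necessarily the authors'
+exact algorithm) is to run a standard linear-RL policy gradient step with the
+pseudo-reward $r = \nabla F(\lambda)$ evaluated at the current occupancy
+measure $\lambda$. A hedged sketch, with an entropy-like utility chosen purely
+for illustration:
+
+import numpy as np
+
+def pseudo_reward(occupancy, utility_grad):
+    # utility_grad: callable returning dF/dlambda, same shape as occupancy
+    return utility_grad(occupancy)
+
+def entropy_grad(lam, eps=1e-12):
+    # gradient of F(lam) = -sum(lam * log lam), a toy non-linear utility
+    return -(np.log(lam + eps) + 1.0)
+
+lam = np.array([[0.3, 0.2], [0.1, 0.4]])   # toy |S| x |A| occupancy measure
+r = pseudo_reward(lam, entropy_grad)       # feed r to any linear-RL PG step
+print(r)
+
 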
+
+
+
+
+ + ♻ ☆ Combining Primal and Dual Representations in Deep Restricted Kernel + Machines Classifiers + + +
+ In the context of deep learning with kernel machines, the deep Restricted
+Kernel Machine (DRKM) framework allows multiple levels of kernel PCA (KPCA) and
+Least-Squares Support Vector Machines (LSSVM) to be combined into a deep
+architecture using visible and hidden units. We propose a new method for DRKM
+classification coupling the objectives of KPCA and classification levels, with
+the hidden feature matrix lying on the Stiefel manifold. The classification
+level can be formulated as an LSSVM or as an MLP feature map, combining depth
+in terms of levels and layers. The classification level is expressed in its
+primal formulation, as the deep KPCA levels, in their dual formulation, can
+embed the most informative components of the data in a much lower dimensional
+space. The dual setting is independent of the dimension of the inputs and the
+primal setting is parametric, which makes the proposed method computationally
+efficient for both high-dimensional inputs and large datasets. In the
+experiments, we show that our developed algorithm can effectively learn from
+small datasets while using less memory than the convolutional neural network
+(CNN) with high-dimensional data, and that models with multiple KPCA levels can
+outperform models with a single level. On the tested larger-scale datasets,
+DRKM is more energy efficient than CNN while maintaining comparable
+performance.
+
 
+
+
+
+
+ + ♻ ☆ Efficient Representation of Natural Image Patches + + +
+ In the complex domain of neural information processing, discerning
+fundamental principles from ancillary details remains a significant challenge.
+While there is extensive knowledge about the anatomy and physiology of the
+early visual system, a comprehensive computational theory remains elusive. Can
+we gain insights into the underlying principles of a biological system by
+abstracting away from its detailed implementation and focusing on the
+fundamental problems that the system is designed to solve? Utilizing an
+abstract model based on minimal yet realistic assumptions, we show how to
+achieve the early visual system's two ultimate objectives: efficient
+information transmission and sensor probability distribution modeling. We show
+that optimizing for information transmission does not yield optimal probability
+distribution modeling. We illustrate, using a two-pixel (2D) system and image
+patches, that an efficient representation can be realized via a nonlinear
+population code driven by two types of biologically plausible loss functions
+that depend solely on output. After unsupervised learning, our abstract IPU
+model bears a remarkable resemblance to biological systems, despite not
+mimicking many features of real neurons, such as spiking activity. A
+preliminary comparison with a contemporary deep learning model suggests that
+the IPU model offers a significant efficiency advantage. Our model provides
+novel insights into the computational theory of early visual systems as well as
+a potential new approach to enhance the efficiency of deep learning models.
+
 
+
+
+
+
+ + ♻ ☆ Incomplete Multi-View Weak-Label Learning with Noisy Features and + Imbalanced Labels + + +
+ A variety of modern applications exhibit multi-view multi-label learning, +where each sample has multi-view features, and multiple labels are correlated +via common views. Current methods usually fail to directly deal with the +setting where only a subset of features and labels are observed for each +sample, and ignore the presence of noisy views and imbalanced labels in +real-world problems. In this paper, we propose a novel method to overcome the +limitations. It jointly embeds incomplete views and weak labels into a +low-dimensional subspace with adaptive weights, and facilitates the difference +between embedding weight matrices via auto-weighted Hilbert-Schmidt +Independence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively +learns view-wise importance for embedding to detect noisy views, and mitigates +the label imbalance problem by focal loss. Experimental results on four +real-world multi-view multi-label datasets demonstrate the effectiveness of the +proposed method. + +
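+ For reference, the HSIC term mentioned above can be estimated from kernel
+matrices as follows; this standard empirical estimator is shown only to
+illustrate the kind of diversity penalty involved, not the paper's exact
+auto-weighted variant.
+
+import numpy as np
+
+def hsic(K, L):
+    """Empirical Hilbert-Schmidt Independence Criterion computed from
+    two n x n kernel matrices K and L."""
+    n = K.shape[0]
+    H = np.eye(n) - np.ones((n, n)) / n   # centering matrix
+    return np.trace(K @ H @ L @ H) / (n - 1) ** 2
+
+rng = np.random.default_rng(0)
+X, Y = rng.normal(size=(20, 3)), rng.normal(size=(20, 3))
+K, L = X @ X.T, Y @ Y.T                   # linear kernels for the demo
+print(hsic(K, L))
+
 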
+
+ comment: 6 pages, 2 figures, conference +
+
+
+
+
+ + ♻ ☆ Multi-label Node Classification On Graph-Structured Data + + +
+ Graph Neural Networks (GNNs) have shown state-of-the-art improvements in node +classification tasks on graphs. While these improvements have been largely +demonstrated in a multi-class classification scenario, a more general and +realistic scenario in which each node could have multiple labels has so far +received little attention. The first challenge in conducting focused studies on +multi-label node classification is the limited number of publicly available +multi-label graph datasets. Therefore, as our first contribution, we collect +and release three real-world biological datasets and develop a multi-label +graph generator to generate datasets with tunable properties. While high label +similarity (high homophily) is usually attributed to the success of GNNs, we +argue that a multi-label scenario does not follow the usual semantics of +homophily and heterophily so far defined for a multi-class scenario. As our +second contribution, besides defining homophily for the multi-label scenario, +we develop a new approach that dynamically fuses the feature and label +correlation information to learn label-informed representations. Finally, we +perform a large-scale comparative study with $10$ methods and $9$ datasets +which also showcase the effectiveness of our approach. We release our benchmark +at \url{https://anonymous.4open.science/r/LFLF-5D8C/}. + +
+
+
+
+
+ + ♻ ☆ Improving Few-Shot Prompts with Relevant Static Analysis Products + + +
+ Large Language Models (LLM) are a new class of computation engines,
+"programmed" via prompt engineering. We are still learning how to best
+"program" these LLMs to help developers. We start with the intuition that
+developers tend to consciously and unconsciously have a collection of semantic
+facts in mind when working on coding tasks. Mostly these are shallow, simple
+facts arising from a quick read. For a function, examples of facts might
+include parameter and local variable names, return expressions, simple pre- and
+post-conditions, and basic control and data flow.
+ One might assume that the powerful multi-layer architecture of
+transformer-style LLMs makes them inherently capable of doing this simple level
+of "code analysis" and extracting such information, implicitly, while
+processing code: but are they, really? If they aren't, could explicitly adding
+this information help? Our goal here is to investigate this question, using the
+code summarization task, and to evaluate whether automatically augmenting an
+LLM's prompt with explicit semantic facts actually helps.
+ Prior work shows that LLM performance on code summarization benefits from
+few-shot samples drawn either from the same project or from examples found via
+information retrieval methods (such as BM25). While summarization performance
+has steadily increased since the early days, there is still room for
+improvement: LLM performance on code summarization still lags its performance
+on natural-language tasks like translation and text summarization.
+ We find that adding semantic facts actually does help! This approach improves
+performance in several different settings suggested by prior work, including
+for two different Large Language Models. In most cases, improvement nears or
+exceeds 2 BLEU; for the PHP language in the challenging CodeSearchNet dataset,
+this augmentation actually yields performance surpassing 30 BLEU.
+
 
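+ A hypothetical sketch of what such fact-augmented prompting could look like;
+the fact format and the prompt template are assumptions, not the paper's
+protocol.
+
+# Hypothetical prompt builder: prepend shallow semantic facts to a
+# few-shot code-summarization prompt.
+def build_prompt(code, facts, few_shot_examples):
+    fact_block = "\n".join(f"# fact: {f}" for f in facts)
+    shots = "\n\n".join(
+        f"Code:\n{ex['code']}\nSummary: {ex['summary']}"
+        for ex in few_shot_examples
+    )
+    return f"{shots}\n\n{fact_block}\nCode:\n{code}\nSummary:"
+
+facts = ["parameters: a, b", "returns: a + b", "no loops or branches"]
+examples = [{"code": "def one(): return 1", "summary": "Returns 1."}]
+print(build_prompt("def add(a, b): return a + b", facts, examples))
+
 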
+
+
+
+
+ + ♻ ☆ AdaTerm: Adaptive T-Distribution Estimated Robust Moments for + Noise-Robust Stochastic Gradient Optimization + + +
+ With the increasing practicality of deep learning applications, practitioners
+are inevitably faced with datasets corrupted by noise from various sources such
+as measurement errors, mislabeling, and estimated surrogate inputs/outputs that
+can adversely impact the optimization results. It is a common practice to
+improve the optimization algorithm's robustness to noise, since this algorithm
+is ultimately in charge of updating the network parameters. Previous studies
+revealed that the first-order moment used in Adam-like stochastic gradient
+descent optimizers can be modified based on the Student's t-distribution. While
+this modification led to noise-resistant updates, the other associated
+statistics remained unchanged, resulting in inconsistencies in the assumed
+models. In this paper, we propose AdaTerm, a novel approach that incorporates
+the Student's t-distribution to derive not only the first-order moment but also
+all the associated statistics. This provides a unified treatment of the
+optimization process, offering a comprehensive framework under the statistical
+model of the t-distribution for the first time. The proposed approach offers
+several advantages over previously proposed approaches, including reduced
+hyperparameters and improved robustness and adaptability. This noise-adaptive
+behavior contributes to AdaTerm's exceptional learning performance, as
+demonstrated through various optimization problems with different and/or
+unknown noise ratios. Furthermore, we introduce a new technique for deriving a
+theoretical regret bound without relying on AMSGrad, providing a valuable
+contribution to the field.
+
 
+
+ comment: 27 pages; Final version accepted by Elsevier Neurocomputing Journal + (2023-08; https://doi.org/10.1016/j.neucom.2023.126692) +
+
+
+
+
+ + ♻ ☆ Ballistocardiogram artifact removal in simultaneous EEG-fMRI using + generative adversarial network + + +
+ Due to its advantages of high temporal and spatial resolution, the technology
+of simultaneous electroencephalogram-functional magnetic resonance imaging
+(EEG-fMRI) acquisition and analysis has attracted much attention, and has been
+widely used in various research fields of brain science. However, during the
+fMRI of the brain, ballistocardiogram (BCG) artifacts can seriously contaminate
+the EEG. As an unpaired problem, BCG artifact removal remains a considerable
+challenge. Aiming to provide a solution, this paper proposes a novel modular
+generative adversarial network (GAN) and corresponding training strategy to
+improve the network performance by optimizing the parameters of each module. In
+this manner, we hope to improve the local representation ability of the network
+model, thereby improving its overall performance and obtaining a reliable
+generator for BCG artifact removal. Moreover, the proposed method does not rely
+on an additional reference signal or complex hardware equipment. Experimental
+results show that, compared with multiple methods, the technique presented in
+this paper can remove the BCG artifact more effectively while retaining
+essential EEG information.
+
 
+
+
+
+
+ + ♻ ☆ EquiDiff: A Conditional Equivariant Diffusion Model For Trajectory + Prediction + + +
+ Accurate trajectory prediction is crucial for the safe and efficient +operation of autonomous vehicles. The growing popularity of deep learning has +led to the development of numerous methods for trajectory prediction. While +deterministic deep learning models have been widely used, deep generative +models have gained popularity as they learn data distributions from training +data and account for trajectory uncertainties. In this study, we propose +EquiDiff, a deep generative model for predicting future vehicle trajectories. +EquiDiff is based on the conditional diffusion model, which generates future +trajectories by incorporating historical information and random Gaussian noise. +The backbone model of EquiDiff is an SO(2)-equivariant transformer that fully +utilizes the geometric properties of location coordinates. In addition, we +employ Recurrent Neural Networks and Graph Attention Networks to extract social +interactions from historical trajectories. To evaluate the performance of +EquiDiff, we conduct extensive experiments on the NGSIM dataset. Our results +demonstrate that EquiDiff outperforms other baseline models in short-term +prediction, but has slightly higher errors for long-term prediction. +Furthermore, we conduct an ablation study to investigate the contribution of +each component of EquiDiff to the prediction accuracy. Additionally, we present +a visualization of the generation process of our diffusion model, providing +insights into the uncertainty of the prediction. + +
+
+
+
+
+ + ♻ ☆ A Conditional Denoising Diffusion Probabilistic Model for Radio + Interferometric Image Reconstruction ECAI 2023 + + +
+ In radio astronomy, signals from radio telescopes are transformed into images +of observed celestial objects, or sources. However, these images, called dirty +images, contain real sources as well as artifacts due to signal sparsity and +other factors. Therefore, radio interferometric image reconstruction is +performed on dirty images, aiming to produce clean images in which artifacts +are reduced and real sources are recovered. So far, existing methods have +limited success on recovering faint sources, preserving detailed structures, +and eliminating artifacts. In this paper, we present VIC-DDPM, a Visibility and +Image Conditioned Denoising Diffusion Probabilistic Model. Our main idea is to +use both the original visibility data in the spectral domain and dirty images +in the spatial domain to guide the image generation process with DDPM. This +way, we can leverage DDPM to generate fine details and eliminate noise, while +utilizing visibility data to separate signals from noise and retaining spatial +information in dirty images. We have conducted experiments in comparison with +both traditional methods and recent deep learning based approaches. Our results +show that our method significantly improves the resulting images by reducing +artifacts, preserving fine details, and recovering dim sources. This +advancement further facilitates radio astronomical data analysis tasks on +celestial phenomena. + +
+
+ comment: Accepted by ECAI 2023 +
+
+
+
+
+ + ♻ ☆ Risk-optimized Outlier Removal for Robust Point Cloud Classification + + +
+ With the growth of 3D sensing technology, deep learning system for 3D point +clouds has become increasingly important, especially in applications like +autonomous vehicles where safety is a primary concern. However, there are also +growing concerns about the reliability of these systems when they encounter +noisy point clouds, whether occurring naturally or introduced with malicious +intent. This paper highlights the challenges of point cloud classification +posed by various forms of noise, from simple background noise to malicious +backdoor attacks that can intentionally skew model predictions. While there's +an urgent need for optimized point cloud denoising, current point outlier +removal approaches, an essential step for denoising, rely heavily on +handcrafted strategies and are not adapted for higher-level tasks, such as +classification. To address this issue, we introduce an innovative point outlier +cleansing method that harnesses the power of downstream classification models. +By employing gradient-based attribution analysis, we define a novel concept: +point risk. Drawing inspiration from tail risk minimization in finance, we +recast the outlier removal process as an optimization problem, named PointCVaR. +Extensive experiments show that our proposed technique not only robustly +filters diverse point cloud outliers but also consistently and significantly +enhances existing robust methods for point cloud classification. + +
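+ A rough sketch of gradient-based point risk and filtering, assuming a toy
+classifier; the CVaR-style optimization (PointCVaR) itself is not reproduced
+here, and all names below are illustrative.
+
+import torch
+
+class TinyPointNet(torch.nn.Module):
+    def __init__(self, num_classes=4):
+        super().__init__()
+        self.mlp = torch.nn.Linear(3, num_classes)
+    def forward(self, x):                       # x: (B, N, 3) point cloud
+        return self.mlp(x).max(dim=1).values    # (B, num_classes)
+
+def point_risk(model, points, label):
+    pts = points.clone().requires_grad_(True)   # (N, 3)
+    loss = torch.nn.functional.cross_entropy(model(pts.unsqueeze(0)), label)
+    (grad,) = torch.autograd.grad(loss, pts)
+    return grad.norm(dim=1)                     # one risk score per point
+
+model, pts = TinyPointNet(), torch.randn(128, 3)
+risk = point_risk(model, pts, torch.tensor([0]))
+keep = torch.topk(-risk, int(0.9 * len(pts))).indices  # drop riskiest 10%
+clean = pts[keep]
+
 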
+
+
+
+
+ + ♻ ☆ On the Robustness of ChatGPT: An Adversarial and Out-of-distribution + Perspective ICLR 2023 + + +
+ ChatGPT is a recently released chatbot service from OpenAI that has been
+receiving increasing attention over the past few months. While evaluations of
+various aspects of ChatGPT have been done, its robustness, i.e., its
+performance on unexpected inputs, is still unclear to the public. Robustness is
+of particular concern in responsible AI, especially for safety-critical
+applications. In this paper, we conduct a thorough evaluation of the robustness
+of ChatGPT from the adversarial and out-of-distribution (OOD) perspective. To
+do so, we employ the AdvGLUE and ANLI benchmarks to assess adversarial
+robustness and the Flipkart review and DDXPlus medical diagnosis datasets for
+OOD evaluation. We select several popular foundation models as baselines.
+Results show that ChatGPT has consistent advantages on most adversarial and OOD
+classification and translation tasks. However, the absolute performance is far
+from perfection, which suggests that adversarial and OOD robustness remains a
+significant threat to foundation models. Moreover, ChatGPT shows astounding
+performance in understanding dialogue-related texts, and we find that it tends
+to provide informal suggestions for medical tasks instead of definitive
+answers. Finally, we present in-depth discussions of possible research
+directions.
+
 
+
+ comment: Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable + Large-Scale Machine Learning Models; code is at: + https://github.com/microsoft/robustlearn; more works: + https://llm-eval.github.io/ +
+
+
+
+
+ + ♻ ☆ Provable Acceleration of Heavy Ball beyond Quadratics for a Class of + Polyak-Łojasiewicz Functions when the Non-Convexity is Averaged-Out ICML 2022 + + +
+ Heavy Ball (HB) nowadays is one of the most popular momentum methods in
+non-convex optimization. It has been widely observed that incorporating the
+Heavy Ball dynamic in gradient-based methods accelerates the training process
+of modern machine learning models. However, progress on establishing its
+theoretical foundation for acceleration lags far behind its empirical success.
+Existing provable acceleration results are for quadratic or close-to-quadratic
+functions, as the current techniques for showing HB's acceleration are limited
+to the case where the Hessian is fixed. In this work, we develop some new
+techniques that help show acceleration beyond quadratics, which is achieved by
+analyzing how the change of the Hessian at two consecutive time points affects
+the convergence speed. Based on our technical results, a class of
+Polyak-\L{}ojasiewicz (PL) optimization problems for which provable
+acceleration can be achieved via HB is identified. Moreover, our analysis
+demonstrates a benefit of adaptively setting the momentum parameter.
+ (Update: 08/29/2023) Erratum is added in Appendix J. This is an updated
+version that fixes an issue in the previous version. An additional condition
+needs to be satisfied for the acceleration result of HB beyond quadratics in
+this work, which naturally holds when the dimension is one or, more broadly,
+when the Hessian is diagonal. We elaborate on the issue in Appendix J.
+
 
+
+ comment: (ICML 2022) Proceedings of the 39th International Conference on + Machine Learning; +
+
+
+
+
+ + ♻ ☆ Semi-supervised Vector-valued Learning: Improved Bounds and Algorithms + + +
+ Vector-valued learning, where the output space admits a vector-valued
+structure, is an important problem that covers a broad family of important
+domains, e.g. multi-task learning and transfer learning. Using local Rademacher
+complexity and unlabeled data, we derive novel semi-supervised excess risk
+bounds for general vector-valued learning from both the kernel and linear
+perspectives. The derived bounds are much sharper than existing ones, and the
+convergence rates are improved from the square root of the labeled sample size
+to the square root of the total sample size, or depend directly on the labeled
+sample size. Motivated by our theoretical analysis, we propose a general
+semi-supervised algorithm for efficiently learning vector-valued functions,
+incorporating both local Rademacher complexity and Laplacian regularization.
+Extensive experimental results illustrate that the proposed algorithm
+significantly outperforms the compared methods, which coincides with our
+theoretical findings.
+
 
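+ To make the Laplacian-regularization ingredient concrete, here is a minimal
+linear, scalar-output special case (the paper's setting is vector-valued and
+kernel-based, so treat this only as a sketch of how unlabeled data enters
+through the graph Laplacian L):
+
+import numpy as np
+
+def laprls(Xl, y, X_all, L, gamma=0.1, lam=1e-3):
+    # minimize ||Xl w - y||^2 + gamma * w' X' L X w + lam * ||w||^2
+    d = Xl.shape[1]
+    A = Xl.T @ Xl + gamma * X_all.T @ L @ X_all + lam * np.eye(d)
+    return np.linalg.solve(A, Xl.T @ y)
+
+rng = np.random.default_rng(1)
+X_all = rng.normal(size=(10, 2))            # labeled + unlabeled inputs
+Xl, y = X_all[:4], rng.normal(size=4)       # only 4 labeled points
+W = rng.random((10, 10)); W = (W + W.T) / 2 # placeholder similarity graph
+np.fill_diagonal(W, 0.0)
+L = np.diag(W.sum(axis=1)) - W              # graph Laplacian
+print(laprls(Xl, y, X_all, L))
+
 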
+
+ comment: Accepted at Pattern Recognition +
+
+
+
+
+ + ♻ ☆ All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with + Prompt-based Finetuning + + +
+ The Segment Anything Model (SAM) is a recently proposed prompt-based +segmentation model in a generic zero-shot segmentation approach. With the +zero-shot segmentation capacity, SAM achieved impressive flexibility and +precision on various segmentation tasks. However, the current pipeline requires +manual prompts during the inference stage, which is still resource intensive +for biomedical image segmentation. In this paper, instead of using prompts +during the inference stage, we introduce a pipeline that utilizes the SAM, +called all-in-SAM, through the entire AI development workflow (from annotation +generation to model finetuning) without requiring manual prompts during the +inference stage. Specifically, SAM is first employed to generate pixel-level +annotations from weak prompts (e.g., points, bounding box). Then, the +pixel-level annotations are used to finetune the SAM segmentation model rather +than training from scratch. Our experimental results reveal two key findings: +1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a +nuclei segmentation task on the public Monuseg dataset, and 2) the utilization +of weak and few annotations for SAM finetuning achieves competitive performance +compared to using strong pixel-wise annotated data. + +
+
+
+
+
+ + ♻ ☆ The Wyner Variational Autoencoder for Unsupervised Multi-Layer Wireless + Fingerprinting + + +
+ Wireless fingerprinting refers to a device identification method leveraging +hardware imperfections and wireless channel variations as signatures. Beyond +physical layer characteristics, recent studies demonstrated that user behaviors +could be identified through network traffic, e.g., packet length, without +decryption of the payload. Inspired by these results, we propose a multi-layer +fingerprinting framework that jointly considers the multi-layer signatures for +improved identification performance. In contrast to previous works, by +leveraging the recent multi-view machine learning paradigm, i.e., data with +multiple forms, our method can cluster the device information shared among the +multi-layer features without supervision. Our information-theoretic approach +can be extended to supervised and semi-supervised settings with straightforward +derivations. In solving the formulated problem, we obtain a tight surrogate +bound using variational inference for efficient optimization. In extracting the +shared device information, we develop an algorithm based on the Wyner common +information method, enjoying reduced computation complexity as compared to +existing approaches. The algorithm can be applied to data distributions +belonging to the exponential family class. Empirically, we evaluate the +algorithm in a synthetic dataset with real-world video traffic and simulated +physical layer characteristics. Our empirical results show that the proposed +method outperforms the state-of-the-art baselines in both supervised and +unsupervised settings. + +
+
+
+
+
+ + ♻ ☆ Group Equality in Adaptive Submodular Maximization + + +
+ In this paper, we study the classic submodular maximization problem subject
+to a group equality constraint under both non-adaptive and adaptive settings.
+It has been shown that the utility function of many machine learning
+applications, including data summarization, influence maximization in social
+networks, and personalized recommendation, satisfies the property of
+submodularity. Hence, maximizing a submodular function subject to various
+constraints can be found at the heart of many of those applications. On a high
+level, submodular maximization aims to select a group of most representative
+items (e.g., data points). However, the design of most existing algorithms does
+not incorporate the fairness constraint, leading to under- or
+over-representation of some particular groups. This motivates us to study the
+submodular maximization problem with group equality, where we aim to select a
+group of items to maximize a (possibly non-monotone) submodular utility
+function subject to a group equality constraint. To this end, we develop the
+first constant-factor approximation algorithm for this problem. The design of
+our algorithm is robust enough to be extended to solving the submodular
+maximization problem under a more complicated adaptive setting. Moreover, we
+further extend our study to incorporate a global cardinality constraint and
+other fairness notions.
+
 
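+ As a toy illustration of how a group-equality constraint interacts with
+greedy selection (the paper's constant-factor algorithm is more involved and
+also handles non-monotone objectives):
+
+def coverage(selected, sets):
+    # monotone submodular objective: number of distinct elements covered
+    return len(set().union(*(sets[i] for i in selected))) if selected else 0
+
+def greedy_group_equal(sets, groups, budget_per_group):
+    chosen, counts = [], {g: 0 for g in set(groups)}
+    while any(c < budget_per_group for c in counts.values()):
+        feasible = [i for i in range(len(sets))
+                    if i not in chosen and counts[groups[i]] < budget_per_group]
+        if not feasible:
+            break
+        best = max(feasible, key=lambda i: coverage(chosen + [i], sets)
+                                           - coverage(chosen, sets))
+        chosen.append(best)
+        counts[groups[best]] += 1
+    return chosen
+
+sets = [{1, 2, 3}, {3, 4}, {5}, {1, 5, 6}]
+print(greedy_group_equal(sets, groups=["a", "a", "b", "b"],
+                         budget_per_group=1))   # one item from each group
+
 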
+
+ comment: This paper has been accepted by INFORMS Journal on Computing +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning for Generative AI: A Survey + + +
+ Deep Generative AI has been a long-standing essential topic in the machine
+learning community, which can impact a number of application areas like text
+generation and computer vision. The major paradigm for training a generative
+model is maximum likelihood estimation, which pushes the learner to capture and
+approximate the target data distribution by decreasing the divergence between
+the model distribution and the target distribution. This formulation
+successfully establishes the objective of generative tasks, but it is incapable
+of satisfying all the requirements that a user might expect from a generative
+model. Reinforcement learning, serving as a competitive option to inject new
+training signals by creating new objectives that exploit novel signals, has
+demonstrated its power and flexibility to incorporate human inductive bias from
+multiple angles, such as adversarial learning, hand-designed rules and learned
+reward models, to build performant models. As a result, reinforcement learning
+has become a trending research field and has stretched the limits of generative
+AI in both model design and application. It is therefore timely to summarize
+recent advances in a comprehensive review. Although surveys of individual
+application areas have appeared recently, this survey aims to provide a
+high-level review that spans a range of application areas. We provide a
+rigorous taxonomy in this area and sufficient coverage of various models and
+applications. Notably, we also survey the fast-developing large language model
+area. We conclude this survey by showing the potential directions that might
+tackle the limits of current models and expand the frontiers of generative AI.
+
 
+
+
+
+
+ + ♻ ☆ Symmetry-Preserving Program Representations for Learning Code Semantics + + +
+ Large Language Models (LLMs) have shown promise in automated program +reasoning, a crucial aspect of many security tasks. However, existing LLM +architectures for code are often borrowed from other domains like natural +language processing, raising concerns about their generalization and robustness +to unseen code. A key generalization challenge is to incorporate the knowledge +of code semantics, including control and data flow, into the LLM architectures. + Drawing inspiration from examples of convolution layers exploiting +translation symmetry, we explore how code symmetries can enhance LLM +architectures for program analysis and modeling. We present a rigorous +group-theoretic framework that formally defines code symmetries as +semantics-preserving transformations and provides techniques for precisely +reasoning about symmetry preservation within LLM architectures. Using this +framework, we introduce a novel variant of self-attention that preserves +program symmetries, demonstrating its effectiveness in generalization and +robustness through detailed experimental evaluations across different binary +and source code analysis tasks. Overall, our code symmetry framework offers +rigorous and powerful reasoning techniques that can guide the future +development of specialized LLMs for code and advance LLM-guided program +reasoning tasks. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems: + Challenges and Opportunities + + +
+ Multi-Sensor Fusion (MSF) based perception systems have been the foundation
+supporting many industrial applications and domains, such as self-driving cars,
+robotic arms, and unmanned aerial vehicles. Over the past few years, fast
+progress in data-driven artificial intelligence (AI) has brought a
+fast-increasing trend of empowering MSF systems with deep learning techniques
+to further improve performance, especially on intelligent systems and their
+perception systems. Although quite a few AI-enabled MSF perception systems and
+techniques have been proposed, up to the present, limited benchmarks that focus
+on MSF perception are publicly available. Given that many intelligent systems
+such as self-driving cars are operated in safety-critical contexts where
+perception systems play an important role, there is an urgent need for a more
+in-depth understanding of the performance and reliability of these MSF systems.
+To bridge this gap, we take an early step in this direction and construct a
+public benchmark of AI-enabled MSF-based perception systems covering three
+commonly adopted tasks (i.e., object detection, object tracking, and depth
+completion). Based on this, to comprehensively understand MSF systems'
+robustness and reliability, we design 14 common and realistic corruption
+patterns to synthesize large-scale corrupted datasets. We then perform a
+systematic evaluation of these systems on the resulting large-scale corrupted
+datasets. Our results reveal the vulnerability of current AI-enabled MSF
+perception systems, calling for researchers and practitioners to take
+robustness and reliability into account when designing AI-enabled MSF.
+
 
+
+ comment: To appear in ESEC/FSE 2023 +
+
+
+
+
+ + ♻ ☆ Block-State Transformer + + +
+ State space models (SSMs) have shown impressive results on tasks that require +modeling long-range dependencies and efficiently scale to long sequences owing +to their subquadratic runtime complexity. Originally designed for continuous +signals, SSMs have shown superior performance on a plethora of tasks, in vision +and audio; however, SSMs still lag Transformer performance in Language Modeling +tasks. In this work, we propose a hybrid layer named Block-State Transformer +(BST), that internally combines an SSM sublayer for long-range +contextualization, and a Block Transformer sublayer for short-term +representation of sequences. We study three different, and completely +parallelizable, variants that integrate SSMs and block-wise attention. We show +that our model outperforms similar Transformer-based architectures on language +modeling perplexity and generalizes to longer sequences. In addition, the +Block-State Transformer demonstrates more than tenfold increase in speed at the +layer level compared to the Block-Recurrent Transformer when model +parallelization is employed. + +
+
+
+
+
+ + ♻ ☆ On Optimal Caching and Model Multiplexing for Large Model Inference + + +
+ Large Language Models (LLMs) and other large foundation models have achieved +noteworthy success, but their size exacerbates existing resource consumption +and latency challenges. In particular, the large-scale deployment of these +models is hindered by the significant resource requirements during inference. +In this paper, we study two approaches for mitigating these challenges: +employing a cache to store previous queries and learning a model multiplexer to +choose from an ensemble of models for query processing. + Theoretically, we provide an optimal algorithm for jointly optimizing both +approaches to reduce the inference cost in both offline and online tabular +settings. By combining a caching algorithm, namely Greedy Dual Size with +Frequency (GDSF) or Least Expected Cost (LEC), with a model multiplexer, we +achieve optimal rates in both offline and online settings. Empirically, +simulations show that the combination of our caching and model multiplexing +algorithms greatly improves over the baselines, with up to $50\times$ +improvement over the baseline when the ratio between the maximum cost and +minimum cost is $100$. Experiments on real datasets show a $4.3\times$ +improvement in FLOPs over the baseline when the ratio for FLOPs is $10$, and a +$1.8\times$ improvement in latency when the ratio for average latency is +$1.85$. + +
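+ For concreteness, a compact sketch of GDSF-style cache priorities, one of the
+caching policies named above; the class layout and simplifications are
+assumptions, not the paper's implementation.
+
+class GDSFCache:
+    """Greedy Dual Size with Frequency: priority = clock + freq*cost/size."""
+    def __init__(self, capacity):
+        self.capacity, self.clock = capacity, 0.0
+        self.store = {}   # key -> (priority, freq, cost, size)
+
+    def get(self, key):
+        if key not in self.store:
+            return None
+        _, f, c, s = self.store[key]
+        self.store[key] = (self.clock + (f + 1) * c / s, f + 1, c, s)
+        return key
+
+    def put(self, key, cost, size=1.0):
+        while len(self.store) >= self.capacity:
+            victim = min(self.store, key=lambda k: self.store[k][0])
+            self.clock = self.store.pop(victim)[0]  # aging via the clock
+        self.store[key] = (self.clock + cost / size, 1, cost, size)
+
+cache = GDSFCache(capacity=2)
+cache.put("q1", cost=5.0); cache.put("q2", cost=1.0); cache.put("q3", cost=9.0)
+print(sorted(cache.store))   # the cheap-to-recompute "q2" was evicted
+
 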
+
+
+
+
+ + ♻ ☆ Fix Fairness, Don't Ruin Accuracy: Performance Aware Fairness Repair + using AutoML + + +
+ Machine learning (ML) is increasingly being used in critical decision-making +software, but incidents have raised questions about the fairness of ML +predictions. To address this issue, new tools and methods are needed to +mitigate bias in ML-based software. Previous studies have proposed bias +mitigation algorithms that only work in specific situations and often result in +a loss of accuracy. Our proposed solution is a novel approach that utilizes +automated machine learning (AutoML) techniques to mitigate bias. Our approach +includes two key innovations: a novel optimization function and a +fairness-aware search space. By improving the default optimization function of +AutoML and incorporating fairness objectives, we are able to mitigate bias with +little to no loss of accuracy. Additionally, we propose a fairness-aware search +space pruning method for AutoML to reduce computational cost and repair time. +Our approach, built on the state-of-the-art Auto-Sklearn tool, is designed to +reduce bias in real-world scenarios. In order to demonstrate the effectiveness +of our approach, we evaluated our approach on four fairness problems and 16 +different ML models, and our results show a significant improvement over the +baseline and existing bias mitigation techniques. Our approach, Fair-AutoML, +successfully repaired 60 out of 64 buggy cases, while existing bias mitigation +techniques only repaired up to 44 out of 64 cases. + +
+
+ comment: In Proceedings of The 31st ACM Joint European Software Engineering + Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE + 2023) +
+
+
+
+
+ + ♻ ☆ Variational Inference for Deblending Crowded Starfields + + +
+ In images collected by astronomical surveys, stars and galaxies often overlap +visually. Deblending is the task of distinguishing and characterizing +individual light sources in survey images. We propose StarNet, a Bayesian +method to deblend sources in astronomical images of crowded star fields. +StarNet leverages recent advances in variational inference, including amortized +variational distributions and an optimization objective targeting an +expectation of the forward KL divergence. In our experiments with SDSS images +of the M2 globular cluster, StarNet is substantially more accurate than two +competing methods: Probabilistic Cataloging (PCAT), a method that uses MCMC for +inference, and DAOPHOT, a software pipeline employed by SDSS for deblending. In +addition, the amortized approach to inference gives StarNet the scaling +characteristics necessary to perform Bayesian inference on modern astronomical +surveys. + +
+
+
+
+
+ + ♻ ☆ On the Existence of the Adversarial Bayes Classifier (Extended Version) NeurIPS + + +
+ Adversarial robustness is a critical property in a variety of modern machine
+learning applications. While it has been the subject of several recent
+theoretical studies, many important questions related to adversarial robustness
+are still open. In this work, we study a fundamental question regarding Bayes
+optimality for adversarial robustness. We provide general sufficient conditions
+under which the existence of a Bayes optimal classifier can be guaranteed for
+adversarial robustness. Our results can provide a useful tool for a subsequent
+study of surrogate losses in adversarial robustness and their consistency
+properties. This manuscript is the extended and corrected version of the paper
+\emph{On the Existence of the Adversarial Bayes Classifier} published in
+NeurIPS 2021. There were two errors in theorem statements in the original paper
+-- one in the definition of pseudo-certifiable robustness and the other in the
+measurability of $A^\epsilon$ for arbitrary metric spaces. In this version we
+correct the errors. Furthermore, the results of the original paper did not
+apply to some non-strictly convex norms and here we extend our results to all
+possible norms.
+
 
+
+ comment: 27 pages, 3 figures. Version 2: Corrects 2 errors in the paper "On + the Existence of the Adversarial Bayes Classifier" published in NeurIPS. + Version 3: Update to acknowledgements +
+
+
+
+
+ + ♻ ☆ Human-Inspired Multi-Agent Navigation using Knowledge Distillation IROS + + +
+ Despite significant advancements in the field of multi-agent navigation, +agents still lack the sophistication and intelligence that humans exhibit in +multi-agent settings. In this paper, we propose a framework for learning a +human-like general collision avoidance policy for agent-agent interactions in +fully decentralized, multi-agent environments. Our approach uses knowledge +distillation with reinforcement learning to shape the reward function based on +expert policies extracted from human trajectory demonstrations through behavior +cloning. We show that agents trained with our approach can take human-like +trajectories in collision avoidance and goal-directed steering tasks not +provided by the demonstrations, outperforming the experts as well as +learning-based agents trained without knowledge distillation. + +
+
+ comment: IEEE/RSJ International Conference on Intelligent Robots and Systems + (IROS), 2021 +
+
+
+
+
+ + ♻ ☆ Preserving Privacy and Security in Federated Learning + + +
+ Federated learning is known to be vulnerable to both security and privacy
+issues. Existing research has focused either on preventing poisoning attacks
+from users or on concealing the local model updates from the server, but not
+both. However, integrating these two lines of research remains a crucial
+challenge since they often conflict with one another with respect to the threat
+model. In this work, we develop a principled framework that offers both privacy
+guarantees for users and detection of poisoning attacks from them. With a new
+threat model that includes both an honest-but-curious server and malicious
+users, we first propose a secure aggregation protocol using homomorphic
+encryption for the server to combine local model updates in a private manner.
+Then, a zero-knowledge proof protocol is leveraged to shift the task of
+detecting attacks in the local models from the server to the users. The key
+observation here is that the server no longer needs access to the local models
+for attack detection. Therefore, our framework enables the central server to
+identify poisoned model updates without violating the privacy guarantees of
+secure aggregation.
+
 
+
+ comment: Published in IEEE/ACM Transactions on Networking +
+
+
+
+
+ + ♻ ☆ Compressive Fourier collocation methods for high-dimensional diffusion + equations with periodic boundary conditions + + +
+ High-dimensional Partial Differential Equations (PDEs) are a popular +mathematical modelling tool, with applications ranging from finance to +computational chemistry. However, standard numerical techniques for solving +these PDEs are typically affected by the curse of dimensionality. In this work, +we tackle this challenge while focusing on stationary diffusion equations +defined over a high-dimensional domain with periodic boundary conditions. +Inspired by recent progress in sparse function approximation in high +dimensions, we propose a new method called compressive Fourier collocation. +Combining ideas from compressive sensing and spectral collocation, our method +replaces the use of structured collocation grids with Monte Carlo sampling and +employs sparse recovery techniques, such as orthogonal matching pursuit and +$\ell^1$ minimization, to approximate the Fourier coefficients of the PDE +solution. We conduct a rigorous theoretical analysis showing that the +approximation error of the proposed method is comparable with the best $s$-term +approximation (with respect to the Fourier basis) to the solution. Using the +recently introduced framework of random sampling in bounded Riesz systems, our +analysis shows that the compressive Fourier collocation method mitigates the +curse of dimensionality with respect to the number of collocation points under +sufficient conditions on the regularity of the diffusion coefficient. We also +present numerical experiments that illustrate the accuracy and stability of the +method for the approximation of sparse and compressible solutions. + +
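+ The sparse-recovery core can be illustrated in one dimension: recover a
+sparse trigonometric expansion from random (Monte Carlo) collocation points
+with orthogonal matching pursuit. The full method applies this idea to the
+collocated diffusion operator rather than to raw function values, so this is
+only a simplified sketch.
+
+import numpy as np
+from sklearn.linear_model import OrthogonalMatchingPursuit
+
+rng = np.random.default_rng(0)
+x = rng.uniform(0.0, 1.0, size=60)              # random collocation points
+freqs = np.arange(1, 16)
+A = np.hstack([np.cos(2 * np.pi * np.outer(x, freqs)),
+               np.sin(2 * np.pi * np.outer(x, freqs))])   # (60, 30) basis
+c_true = np.zeros(30); c_true[[2, 20]] = [1.0, -2.0]      # 2-sparse target
+y = A @ c_true                                  # noiseless samples
+
+omp = OrthogonalMatchingPursuit(n_nonzero_coefs=2, fit_intercept=False)
+omp.fit(A, y)
+print(np.nonzero(np.abs(omp.coef_) > 1e-8)[0])  # expect indices [ 2 20]
+
 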
+
+ comment: 33 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Trustworthy Representation Learning Across Domains + + +
+ As AI systems have obtained significant performance to be deployed widely in +our daily live and human society, people both enjoy the benefits brought by +these technologies and suffer many social issues induced by these systems. To +make AI systems good enough and trustworthy, plenty of researches have been +done to build guidelines for trustworthy AI systems. Machine learning is one of +the most important parts for AI systems and representation learning is the +fundamental technology in machine learning. How to make the representation +learning trustworthy in real-world application, e.g., cross domain scenarios, +is very valuable and necessary for both machine learning and AI system fields. +Inspired by the concepts in trustworthy AI, we proposed the first trustworthy +representation learning across domains framework which includes four concepts, +i.e, robustness, privacy, fairness, and explainability, to give a comprehensive +literature review on this research direction. Specifically, we first introduce +the details of the proposed trustworthy framework for representation learning +across domains. Second, we provide basic notions and comprehensively summarize +existing methods for the trustworthy framework from four concepts. Finally, we +conclude this survey with insights and discussions on future research +directions. + +
+
+ comment: 38 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Eliciting Latent Predictions from Transformers with the Tuned Lens + + +
+ We analyze transformers from the perspective of iterative inference, seeking +to understand how model predictions are refined layer by layer. To do so, we +train an affine probe for each block in a frozen pretrained model, making it +possible to decode every hidden state into a distribution over the vocabulary. +Our method, the tuned lens, is a refinement of the earlier "logit lens" +technique, which yielded useful insights but is often brittle. + We test our method on various autoregressive language models with up to 20B +parameters, showing it to be more predictive, reliable and unbiased than the +logit lens. With causal experiments, we show the tuned lens uses similar +features to the model itself. We also find the trajectory of latent predictions +can be used to detect malicious inputs with high accuracy. All code needed to +reproduce our results can be found at +https://github.com/AlignmentResearch/tuned-lens. + +
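+ Schematically, a tuned-lens-style probe is just a learned affine map inserted
+before the (frozen) unembedding; the dimensions and the omitted training loop
+below are illustrative assumptions, not the released implementation.
+
+import torch
+
+d_model, vocab = 64, 100
+probe = torch.nn.Linear(d_model, d_model)              # trainable translator
+unembed = torch.nn.Linear(d_model, vocab, bias=False)  # frozen final head
+
+def lens_logits(hidden_state):
+    # decode an intermediate hidden state into vocabulary logits
+    return unembed(probe(hidden_state))
+
+h = torch.randn(1, d_model)               # hidden state from some block
+dist = torch.softmax(lens_logits(h), dim=-1)
+# Training would minimize the KL divergence between the model's final-layer
+# next-token distribution and this lens distribution, one probe per block.
+print(dist.shape)
+
 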
+
+
+
+
+ + ♻ ☆ Contrastive Credibility Propagation for Reliable Semi-Supervised + Learning + + +
+ Producing labels for unlabeled data is error-prone, making semi-supervised +learning (SSL) troublesome. Often, little is known about when and why an +algorithm fails to outperform a supervised baseline. Using benchmark datasets, +we craft five common real-world SSL data scenarios: few-label, open-set, +noisy-label, and class distribution imbalance/misalignment in the labeled and +unlabeled sets. We propose a novel algorithm called Contrastive Credibility +Propagation (CCP) for deep SSL via iterative transductive pseudo-label +refinement. CCP unifies semi-supervised learning and noisy label learning for +the goal of reliably outperforming a supervised baseline in any data scenario. +Compared to prior methods which focus on a subset of scenarios, CCP uniquely +outperforms the supervised baseline in all scenarios, supporting practitioners +when the qualities of labeled or unlabeled data are unknown. + +
+
+
+
+
+ + ♻ ☆ Regression with Label Differential Privacy ICLR '23 + + +
+ We study the task of training regression models with the guarantee of label +differential privacy (DP). Based on a global prior distribution on label +values, which could be obtained privately, we derive a label DP randomization +mechanism that is optimal under a given regression loss function. We prove that +the optimal mechanism takes the form of a "randomized response on bins", and +propose an efficient algorithm for finding the optimal bin values. We carry out +a thorough experimental evaluation on several datasets demonstrating the +efficacy of our algorithm. + +
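+ A generic "randomized response on bins" mechanism looks as follows; the
+uniform off-diagonal response shown here is an assumption for illustration,
+whereas the paper derives the loss-optimal response and bin values from the
+prior.
+
+import numpy as np
+
+def randomized_response(true_bin, num_bins, eps, rng):
+    # report the true bin with the eps-DP-maximal probability,
+    # otherwise a uniformly random other bin
+    p_true = np.exp(eps) / (np.exp(eps) + num_bins - 1)
+    if rng.random() < p_true:
+        return true_bin
+    others = [b for b in range(num_bins) if b != true_bin]
+    return rng.choice(others)
+
+rng = np.random.default_rng(0)
+print([randomized_response(2, 5, eps=1.0, rng=rng) for _ in range(10)])
+
 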
+
+ comment: Appeared at ICLR '23, 28 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ On Low-rank Trace Regression under General Sampling Distribution + + +
+ In this paper, we study the trace regression when a matrix of parameters
+$B^*$ is estimated via the convex relaxation of a rank-regularized regression
+or via regularized non-convex optimization. It is known that these estimators
+satisfy near-optimal error bounds under assumptions on the rank, coherence, and
+spikiness of $B^*$. We start by introducing a general notion of spikiness for
+$B^*$ that provides a generic recipe to prove the restricted strong convexity
+of the sampling operator of the trace regression and obtain near-optimal and
+non-asymptotic error bounds for the estimation error. Similar to the existing
+literature, these results require the regularization parameter to be above a
+certain theory-inspired threshold that depends on observation noise that may be
+unknown in practice. Next, we extend the error bounds to cases where the
+regularization parameter is chosen via cross-validation. This result is
+significant in that existing theoretical results on cross-validated estimators
+(Kale et al., 2011; Kumar et al., 2013; Abou-Moustafa and Szepesvari, 2017) do
+not apply to our setting since the estimators we study are not known to satisfy
+their required notion of stability. Finally, using simulations on synthetic and
+real data, we show that the cross-validated estimator selects a near-optimal
+penalty parameter and outperforms the theory-inspired approach of selecting the
+parameter.
+
 
+
+ comment: 49 pages, 6 figures
+
 
+
+
+
+
+ + ♻ ☆ GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content SP + + +
+ The mobile cloud gaming industry has been rapidly growing over the last +decade. When streaming gaming videos are transmitted to customers' client +devices from cloud servers, algorithms that can monitor distorted video quality +without having any reference video available are desirable tools. However, +creating No-Reference Video Quality Assessment (NR VQA) models that can +accurately predict the quality of streaming gaming videos rendered by computer +graphics engines is a challenging problem, since gaming content generally +differs statistically from naturalistic videos, often lacks detail, and +contains many smooth regions. Until recently, the problem has been further +complicated by the lack of adequate subjective quality databases of mobile +gaming content. We have created a new gaming-specific NR VQA model called the +Gaming Video Quality Evaluator (GAMIVAL), which combines and leverages the +advantages of spatial and temporal gaming distorted scene statistics models, a +neural noise model, and deep semantic features. Using a support vector +regression (SVR) as a regressor, GAMIVAL achieves superior performance on the +new LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database. + +
+
+ comment: Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been + made available online: https://github.com/lskdream/GAMIVAL +
+
+
+
+
+ + ♻ ☆ Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language + Models + + +
+ Dense retrieval (DR) converts queries and documents into dense embeddings and +measures the similarity between queries and documents in vector space. One of +the challenges in DR is the lack of domain-specific training data. While DR +models can learn from large-scale public datasets like MS MARCO through +transfer learning, evidence shows that not all DR models and domains can +benefit from transfer learning equally. Recently, some researchers have +resorted to large language models (LLMs) to improve the zero-shot and few-shot +DR models. However, the hard prompts or human-written prompts utilized in these +works cannot guarantee the good quality of generated weak queries. To tackle +this, we propose soft prompt tuning for augmenting DR (SPTAR): For each task, +we leverage soft prompt-tuning to optimize a task-specific soft prompt on +limited ground truth data and then prompt the LLMs to tag unlabeled documents +with weak queries, yielding enough weak document-query pairs to train +task-specific dense retrievers. We design a filter to select high-quality +example document-query pairs in the prompt to further improve the quality of +weak tagged queries. To the best of our knowledge, there is no prior work +utilizing soft prompt tuning to augment DR models. The experiments demonstrate +that SPTAR outperforms the unsupervised baselines BM25 and the recently +proposed LLMs-based augmentation method for DR. + +
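+ A minimal picture of the soft-prompt mechanism: a few trainable vectors are
+prepended to frozen token embeddings, and only those vectors receive
+gradients. Sizes below are arbitrary assumptions, not SPTAR's configuration.
+
+import torch
+
+d_model, prompt_len, vocab = 32, 5, 100
+token_emb = torch.nn.Embedding(vocab, d_model)
+for p in token_emb.parameters():
+    p.requires_grad_(False)                  # frozen LM side
+soft_prompt = torch.nn.Parameter(torch.randn(prompt_len, d_model) * 0.02)
+
+ids = torch.randint(0, vocab, (1, 8))        # a tokenized input
+inputs = torch.cat([soft_prompt.unsqueeze(0).expand(1, -1, -1),
+                    token_emb(ids)], dim=1)  # (1, 5 + 8, d_model)
+print(inputs.shape)                          # fed to the frozen LM; only
+                                             # soft_prompt is optimized
+
 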
+
+ comment: fix typos +
+
+
+
+
+ + ♻ ☆ A Transformer-based Framework For Multi-variate Time Series: A Remaining + Useful Life Prediction Use Case + + +
+ In recent times, Large Language Models (LLMs) have captured a global
+spotlight and revolutionized the field of Natural Language Processing. One of
+the factors attributed to the effectiveness of LLMs is the model architecture
+used for training: transformers. Transformer models excel at capturing
+contextual features in sequential data; since time series data are sequential,
+transformer models can be leveraged for more efficient time series prediction.
+The field of prognostics is vital to system health management and proper
+maintenance planning. A reliable estimation of the remaining useful life (RUL)
+of machines holds the potential for substantial cost savings. This includes
+avoiding abrupt machine failures, maximizing equipment usage, and serving as a
+decision support system (DSS). This work proposes an encoder-transformer
+architecture-based framework for multivariate time series prediction for a
+prognostics use case. We validated the effectiveness of the proposed framework
+on all four sets of the C-MAPPS benchmark dataset for the remaining useful life
+prediction task. To effectively transfer the knowledge and application of
+transformers from the natural language domain to time series, three
+model-specific experiments were conducted. Also, to make the model aware of the
+initial stages of the machine's life and its degradation path, a novel
+expanding window method is proposed for the first time in this work; compared
+with the sliding window method, it led to a large improvement in the
+performance of the encoder-transformer model. Finally, the performance of the
+proposed encoder-transformer model was evaluated on the test dataset and
+compared with the results of 13 other state-of-the-art (SOTA) models in the
+literature; it outperformed them all with an average performance increase of
+137.65% over the next best model across all the datasets.
+
 
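+ The contrast between the proposed expanding window and the usual sliding
+window can be seen in a few lines; the exact window parameters used in the
+paper may differ.
+
+import numpy as np
+
+def sliding_windows(series, width):
+    # fixed-width windows, the conventional choice
+    return [series[i:i + width] for i in range(len(series) - width + 1)]
+
+def expanding_windows(series, start=2):
+    # every prefix of the run-to-failure trajectory, so the model also
+    # sees the machine's early life and its full degradation path
+    return [series[:i] for i in range(start, len(series) + 1)]
+
+s = np.arange(6)
+print(sliding_windows(s, 3))   # 4 windows of length 3
+print(expanding_windows(s))    # prefixes of length 2..6
+
 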
+
+
+
+
+ + ♻ ☆ Diversifying AI: Towards Creative Chess with AlphaZero + + +
+ In recent years, Artificial Intelligence (AI) systems have surpassed human +intelligence in a variety of computational tasks. However, AI systems, like +humans, make mistakes, have blind spots, hallucinate, and struggle to +generalize to new situations. This work explores whether AI can benefit from +creative decision-making mechanisms when pushed to the limits of its +computational rationality. In particular, we investigate whether a team of +diverse AI systems can outperform a single AI in challenging tasks by +generating more ideas as a group and then selecting the best ones. We study +this question in the game of chess, the so-called drosophila of AI. We build on +AlphaZero (AZ) and extend it to represent a league of agents via a +latent-conditioned architecture, which we call AZ_db. We train AZ_db to +generate a wider range of ideas using behavioral diversity techniques and +select the most promising ones with sub-additive planning. Our experiments +suggest that AZ_db plays chess in diverse ways, solves more puzzles as a group +and outperforms a more homogeneous team. Notably, AZ_db solves twice as many +challenging puzzles as AZ, including the challenging Penrose positions. When +playing chess from different openings, we notice that players in AZ_db +specialize in different openings, and that selecting a player for each opening +using sub-additive planning results in a 50 Elo improvement over AZ. Our +findings suggest that diversity bonuses emerge in teams of AI agents, just as +they do in teams of humans and that diversity is a valuable asset in solving +computationally hard problems. + +
+
+
+
+
+ + ♻ ☆ RecXplainer: Amortized Attribute-based Personalized Explanations for + Recommender Systems NeurIPS 2022 + + +
+ Recommender systems influence many of our interactions in the digital world +-- impacting how we shop for clothes, sorting what we see when browsing YouTube +or TikTok, and determining which restaurants and hotels we are shown when using +hospitality platforms. Modern recommender systems are large, opaque models +trained on a mixture of proprietary and open-source datasets. Naturally, issues +of trust arise on both the developer and user side: is the system working +correctly, and why did a user receive (or not receive) a particular +recommendation? Providing an explanation alongside a recommendation alleviates +some of these concerns. The status quo for auxiliary recommender system +feedback is either user-specific explanations (e.g., "users who bought item B +also bought item A") or item-specific explanations (e.g., "we are recommending +item A because you watched/bought item B"). However, users bring personalized +context into their search experience, valuing an item as a function of that +item's attributes and their own personal preferences. In this work, we propose +RecXplainer, a novel method for generating fine-grained explanations based on a +user's preferences over the attributes of recommended items. We evaluate +RecXplainer on five real-world and large-scale recommendation datasets using +five different kinds of recommender systems to demonstrate the efficacy of +RecXplainer in capturing users' preferences over item attributes and using them +to explain recommendations. We also compare RecXplainer to five baselines and +show RecXplainer's exceptional performance on ten metrics. + +
+
+ comment: Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022 +
+
+
+
+
+ + ♻ ☆ Variationally Mimetic Operator Networks + + +
+ In recent years operator networks have emerged as promising deep learning +tools for approximating the solution to partial differential equations (PDEs). +These networks map input functions that describe material properties, forcing +functions and boundary data to the solution of a PDE. This work describes a new +architecture for operator networks that mimics the form of the numerical +solution obtained from an approximate variational or weak formulation of the +problem. The application of these ideas to a generic elliptic PDE leads to a +variationally mimetic operator network (VarMiON). Like the conventional Deep +Operator Network (DeepONet) the VarMiON is also composed of a sub-network that +constructs the basis functions for the output and another that constructs the +coefficients for these basis functions. However, in contrast to the DeepONet, +the architecture of these sub-networks in the VarMiON is precisely determined. +An analysis of the error in the VarMiON solution reveals that it contains +contributions from the error in the training data, the training error, the +quadrature error in sampling input and output functions, and a "covering error" +that measures the distance between the test input functions and the nearest +functions in the training dataset. It also depends on the stability constants +for the exact solution operator and its VarMiON approximation. The application +of the VarMiON to a canonical elliptic PDE and a nonlinear PDE reveals that for +approximately the same number of network parameters, on average the VarMiON +incurs smaller errors than a standard DeepONet and a recently proposed +multiple-input operator network (MIONet). Further, its performance is more +robust to variations in input functions, the techniques used to sample the +input and output functions, the techniques used to construct the basis +functions, and the number of input functions. + +
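In DeepONet-style notation (ours, not the paper's), both architectures share the coefficient-times-basis form; what the variational mimicry fixes is the precise structure of the two sub-networks:

```latex
% \hat{F}: discretized input data (material properties, forcing, boundary data)
% \beta_i: coefficient sub-network outputs; \tau_i: basis-function sub-network
\hat{u}(x) \;=\; \sum_{i=1}^{p} \beta_i\bigl(\hat{F}\bigr)\,\tau_i(x)
```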
+
+ comment: 49 pages, 18 figures, 1 Appendix +
+
+
+
+
+ + ♻ ☆ EntropyRank: Unsupervised Keyphrase Extraction via Side-Information + Optimization for Language Model-based Text Compression + + +
+ We propose an unsupervised method to extract keywords and keyphrases from
+texts based on a pre-trained language model (LM) and Shannon's information
+maximization. Specifically, our method extracts phrases having the highest
+conditional entropy under the LM. The resulting set of keyphrases turns out to
+solve a relevant information-theoretic problem: if provided as side
+information, it leads to the expected minimal binary code length in compressing
+the text using the LM and an entropy encoder. Alternatively, the resulting set
+is an approximation via a causal LM to the set of phrases that minimize the
+entropy of the text when conditioned upon it. Empirically, the method provides
+results comparable to the most commonly used methods in various keyphrase
+extraction benchmark challenges.
+

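A minimal sketch of the core quantity, using GPT-2 via Hugging Face transformers as the causal LM; the paper's pipeline additionally forms candidate phrases and aggregates such per-token entropies over spans, which we omit here.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
lm = GPT2LMHeadModel.from_pretrained("gpt2").eval()

@torch.no_grad()
def next_token_entropies(text):
    ids = tok(text, return_tensors="pt").input_ids
    logp = torch.log_softmax(lm(ids).logits[0, :-1], dim=-1)
    ent = -(logp.exp() * logp).sum(-1)  # entropy of each next-token distribution
    return tok.convert_ids_to_tokens(ids[0, 1:].tolist()), ent.tolist()

toks, ent = next_token_entropies(
    "We propose an unsupervised method to extract keyphrases from text.")
for t, h in sorted(zip(toks, ent), key=lambda p: -p[1])[:5]:
    print(f"{t!r}: {h:.2f} nats")  # highest-entropy (hardest to predict) tokens
```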
+
+
+
+
+ + ♻ ☆ The Future of Fundamental Science Led by Generative Closed-Loop + Artificial Intelligence + + +
+ Recent advances in machine learning and AI, including Generative AI and LLMs, +are disrupting technological innovation, product development, and society as a +whole. AI's contribution to technology can come from multiple approaches that +require access to large training data sets and clear performance evaluation +criteria, ranging from pattern recognition and classification to generative +models. Yet, AI has contributed less to fundamental science in part because +large data sets of high-quality data for scientific practice and model +discovery are more difficult to access. Generative AI, in general, and Large +Language Models in particular, may represent an opportunity to augment and +accelerate the scientific discovery of fundamental deep science with +quantitative models. Here we explore and investigate aspects of an AI-driven, +automated, closed-loop approach to scientific discovery, including self-driven +hypothesis generation and open-ended autonomous exploration of the hypothesis +space. Integrating AI-driven automation into the practice of science would +mitigate current problems, including the replication of findings, systematic +production of data, and ultimately democratisation of the scientific process. +Realising these possibilities requires a vision for augmented AI coupled with a +diversity of AI approaches able to deal with fundamental aspects of causality +analysis and model discovery while enabling unbiased search across the space of +putative explanations. These advances hold the promise to unleash AI's +potential for searching and discovering the fundamental structure of our world +beyond what human scientists have been able to achieve. Such a vision would +push the boundaries of new fundamental science rather than automatize current +workflows and instead open doors for technological innovation to tackle some of +the greatest challenges facing humanity today. + +
+
+ comment: 35 pages, first draft of the final report from the Alan Turing + Institute on AI for Scientific Discovery +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ On the Steganographic Capacity of Selected Learning Models + + +
+ Machine learning and deep learning models are potential vectors for various +attack scenarios. For example, previous research has shown that malware can be +hidden in deep learning models. Hiding information in a learning model can be +viewed as a form of steganography. In this research, we consider the general +question of the steganographic capacity of learning models. Specifically, for a +wide range of models, we determine the number of low-order bits of the trained +parameters that can be overwritten, without adversely affecting model +performance. For each model considered, we graph the accuracy as a function of +the number of low-order bits that have been overwritten, and for selected +models, we also analyze the steganographic capacity of individual layers. The +models that we test include the classic machine learning techniques of Linear +Regression (LR) and Support Vector Machine (SVM); the popular general deep +learning models of Multilayer Perceptron (MLP) and Convolutional Neural Network +(CNN); the highly-successful Recurrent Neural Network (RNN) architecture of +Long Short-Term Memory (LSTM); the pre-trained transfer learning-based models +VGG16, DenseNet121, InceptionV3, and Xception; and, finally, an Auxiliary +Classifier Generative Adversarial Network (ACGAN). In all cases, we find that a +majority of the bits of each trained parameter can be overwritten before the +accuracy degrades. Of the models tested, the steganographic capacity ranges +from 7.04 KB for our LR experiments, to 44.74 MB for InceptionV3. We discuss +the implications of our results and consider possible avenues for further +research. + +
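The probe itself is easy to reproduce in spirit: overwrite the n low-order mantissa bits of float32 parameters and re-measure accuracy. A minimal numpy sketch (ours; the paper's exact embedding and measurement protocol may differ):

```python
import numpy as np

def overwrite_low_bits(weights, n_bits, payload):
    """Embed n_bits of payload into each float32 weight's low-order bits."""
    w = np.ascontiguousarray(weights, dtype=np.float32)
    raw = w.view(np.uint32).copy()
    mask = np.uint32((1 << n_bits) - 1)
    bits = np.resize(payload.astype(np.uint32), raw.shape) & mask
    return ((raw & ~mask) | bits).view(np.float32)

w = np.random.randn(1000).astype(np.float32)
secret = np.random.randint(0, 2 ** 8, size=1000)
w_steg = overwrite_low_bits(w, n_bits=8, payload=secret)
print(np.abs(w - w_steg).max())  # distortion stays tiny for low-order bits
```

Sweeping n_bits and re-evaluating the model after each overwrite reproduces the accuracy-versus-bits curves the abstract describes.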
+
+ comment: arXiv admin note: text overlap with arXiv:2306.17189 +
+
+
+
+
+ + ♻ ☆ GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content SP + + +
+ The mobile cloud gaming industry has been rapidly growing over the last +decade. When streaming gaming videos are transmitted to customers' client +devices from cloud servers, algorithms that can monitor distorted video quality +without having any reference video available are desirable tools. However, +creating No-Reference Video Quality Assessment (NR VQA) models that can +accurately predict the quality of streaming gaming videos rendered by computer +graphics engines is a challenging problem, since gaming content generally +differs statistically from naturalistic videos, often lacks detail, and +contains many smooth regions. Until recently, the problem has been further +complicated by the lack of adequate subjective quality databases of mobile +gaming content. We have created a new gaming-specific NR VQA model called the +Gaming Video Quality Evaluator (GAMIVAL), which combines and leverages the +advantages of spatial and temporal gaming distorted scene statistics models, a +neural noise model, and deep semantic features. Using a support vector +regression (SVR) as a regressor, GAMIVAL achieves superior performance on the +new LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database. + +
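The regression head is standard; a sketch with scikit-learn, using random stand-ins for the paper's feature vectors and MOS labels:

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

X = np.random.rand(200, 120)          # per-video GAMIVAL-style feature vectors
y = np.random.uniform(1.0, 5.0, 200)  # subjective quality scores (MOS)

svr = make_pipeline(StandardScaler(), SVR(kernel="rbf", C=1.0, epsilon=0.1))
svr.fit(X, y)
print(svr.predict(X[:3]))             # predicted quality for three videos
```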
+
+ comment: Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been + made available online: https://github.com/lskdream/GAMIVAL +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 59 + +
+
+
+ + ☆ Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual + Predatory Chats and Abusive Texts + + +
+ Detecting online sexual predatory behaviours and abusive language on social +media platforms has become a critical area of research due to the growing +concerns about online safety, especially for vulnerable populations such as +children and adolescents. Researchers have been exploring various techniques +and approaches to develop effective detection systems that can identify and +mitigate these risks. Recent development of large language models (LLMs) has +opened a new opportunity to address this problem more effectively. This paper +proposes an approach to detection of online sexual predatory chats and abusive +language using the open-source pretrained Llama 2 7B-parameter model, recently +released by Meta GenAI. We fine-tune the LLM using datasets with different +sizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu). +Based on the power of LLMs, our approach is generic and automated without a +manual search for a synergy between feature extraction and classifier design +steps like conventional methods in this domain. Experimental results show a +strong performance of the proposed approach, which performs proficiently and +consistently across three distinct datasets with five sets of experiments. This +study's outcomes indicate that the proposed method can be implemented in +real-world applications (even with non-English languages) for flagging sexual +predators, offensive or toxic content, hate speech, and discriminatory language +in online discussions and comments to maintain respectful internet or digital +communities. Furthermore, it can be employed for solving text classification +problems with other potential applications such as sentiment analysis, spam and +phishing detection, sorting legal documents, fake news detection, language +identification, user intent recognition, text-based product categorization, +medical record analysis, and resume screening. + +
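The abstract does not say which fine-tuning recipe was used; a common parameter-efficient setup for Llama 2 7B at the time looked roughly like this (the model id is the real Hugging Face checkpoint; the LoRA choice and hyperparameters are our illustrative assumptions):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = "meta-llama/Llama-2-7b-hf"     # gated checkpoint; requires access
tok = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(
    base, load_in_4bit=True, device_map="auto")  # needs bitsandbytes

lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                  target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
model = get_peft_model(model, lora)
model.print_trainable_parameters()
# ...then train on (chat text -> label) pairs with a standard trainer loop.
```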
+
+
+
+
+ + ☆ ANER: Arabic and Arabizi Named Entity Recognition using + Transformer-Based Approach + + +
+ One of the main tasks of Natural Language Processing (NLP) is Named Entity
+Recognition (NER). It is used in many applications and can also serve as an
+intermediate step for other tasks. We present ANER, a web-based named entity
+recognizer for the Arabic and Arabizi languages. The model is built upon BERT,
+which is a transformer-based encoder. It can recognize 50 different entity
+classes, covering various fields. We trained our model on the WikiFANE_Gold
+dataset, which consists of Wikipedia articles. We achieved an F1 score of
+88.7%, which beats CAMeL Tools' F1 score of 83% on the ANERcorp dataset,
+which has only 4 classes. We also obtained an F1 score of 77.7% on the
+NewsFANE_Gold dataset, which contains out-of-domain data from news articles.
+The system is deployed on a user-friendly web interface that accepts users'
+inputs in Arabic or Arabizi. It allows users to explore the entities in the
+text by highlighting them. It can also direct users to information about
+entities through Wikipedia directly. We added the ability to do NER using our
+model or CAMeL Tools' model through our website. ANER is publicly accessible
+at http://www.aner.online. We also deployed our model on HuggingFace at
+https://huggingface.co/boda/ANER, to allow developers to test and use it.
+

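Since the model is hosted on the Hub, and assuming it is a standard token-classification checkpoint (we have not verified its config), it could be queried like this:

```python
from transformers import pipeline

ner = pipeline("token-classification", model="boda/ANER",
               aggregation_strategy="simple")
for ent in ner("محمد يعيش في القاهرة"):   # "Mohamed lives in Cairo"
    print(ent["entity_group"], ent["word"], round(float(ent["score"]), 3))
```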
+
+
+
+
+ + ☆ Joint Multiple Intent Detection and Slot Filling with Supervised + Contrastive Learning and Self-Distillation ECAI 2023 + + +
+ Multiple intent detection and slot filling are two fundamental and crucial +tasks in spoken language understanding. Motivated by the fact that the two +tasks are closely related, joint models that can detect intents and extract +slots simultaneously are preferred to individual models that perform each task +independently. The accuracy of a joint model depends heavily on the ability of +the model to transfer information between the two tasks so that the result of +one task can correct the result of the other. In addition, since a joint model +has multiple outputs, how to train the model effectively is also challenging. +In this paper, we present a method for multiple intent detection and slot +filling by addressing these challenges. First, we propose a bidirectional joint +model that explicitly employs intent information to recognize slots and slot +features to detect intents. Second, we introduce a novel method for training +the proposed joint model using supervised contrastive learning and +self-distillation. Experimental results on two benchmark datasets MixATIS and +MixSNIPS show that our method outperforms state-of-the-art models in both +tasks. The results also demonstrate the contributions of both bidirectional +design and the training method to the accuracy improvement. Our source code is +available at https://github.com/anhtunguyen98/BiSLU + +
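To make the training signal concrete, here is a generic supervised contrastive loss (Khosla et al., 2020) over utterance embeddings grouped by intent label; this is our illustration of the technique named in the abstract, not the authors' exact objective:

```python
import torch
import torch.nn.functional as F

def supcon_loss(feats, labels, tau=0.1):
    """Pull same-label embeddings together, push different-label ones apart."""
    feats = F.normalize(feats, dim=-1)
    sim = feats @ feats.T / tau
    self_mask = torch.eye(len(feats), dtype=torch.bool)
    sim = sim.masked_fill(self_mask, float("-inf"))
    log_prob = sim - sim.logsumexp(dim=1, keepdim=True)
    pos = (labels[:, None] == labels[None, :]) & ~self_mask
    per_anchor = -(log_prob.masked_fill(~pos, 0.0).sum(1)
                   / pos.sum(1).clamp(min=1))
    return per_anchor[pos.any(1)].mean()  # anchors with >= 1 positive

z = torch.randn(8, 64)                       # toy utterance embeddings
y = torch.tensor([0, 0, 1, 1, 2, 2, 0, 1])   # intent labels
print(supcon_loss(z, y))
```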
+
+ comment: Accepted at ECAI 2023 +
+
+
+
+
+ + ☆ Challenges of GPT-3-based Conversational Agents for Healthcare + + +
+ The potential to provide patients with faster information access while +allowing medical specialists to concentrate on critical tasks makes medical +domain dialog agents appealing. However, the integration of large-language +models (LLMs) into these agents presents certain limitations that may result in +serious consequences. This paper investigates the challenges and risks of using +GPT-3-based models for medical question-answering (MedQA). We perform several +evaluations contextualized in terms of standard medical principles. We provide +a procedure for manually designing patient queries to stress-test high-risk +limitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to +respond adequately to these queries, generating erroneous medical information, +unsafe recommendations, and content that may be considered offensive. + +
+
+ comment: 12 pages, 9 Tables, accepted to RANLP 2023 +
+
+
+
+
+ + ☆ Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance IJCAI-2023 + + +
+ We propose the use of conversational GPT models for easy and quick few-shot +text classification in the financial domain using the Banking77 dataset. Our +approach involves in-context learning with GPT-3.5 and GPT-4, which minimizes +the technical expertise required and eliminates the need for expensive GPU +computing while yielding quick and accurate results. Additionally, we fine-tune +other pre-trained, masked language models with SetFit, a recent contrastive +learning technique, to achieve state-of-the-art results both in full-data and +few-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can +outperform fine-tuned, non-generative models even with fewer examples. However, +subscription fees associated with these solutions may be considered costly for +small organizations. Lastly, we find that generative models perform better on +the given task when shown representative samples selected by a human expert +rather than when shown random ones. We conclude that a) our proposed methods +offer a practical solution for few-shot tasks in datasets with limited label +availability, and b) our state-of-the-art results can inspire future work in +the area. + +
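A minimal sketch of the in-context setup with the legacy (pre-1.0) openai SDK that was current when this was written; the prompt wording, exemplar choice, and label subset are illustrative:

```python
import openai  # assumes OPENAI_API_KEY is set in the environment

def few_shot_classify(query, exemplars, labels):
    shots = "\n".join(f"Query: {q}\nIntent: {y}" for q, y in exemplars)
    prompt = (f"Classify the banking query into one of: {', '.join(labels)}.\n"
              f"{shots}\nQuery: {query}\nIntent:")
    resp = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return resp.choices[0].message.content.strip()

labels = ["card_arrival", "exchange_rate", "lost_or_stolen_card"]
shots = [("When will my new card get here?", "card_arrival")]
print(few_shot_classify("What is the USD rate right now?", shots, labels))
```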
+
+ comment: Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023 +
+
+
+
+
+ + ☆ AI in the Gray: Exploring Moderation Policies in Dialogic Large Language + Models vs. Human Answers in Controversial Topics + + +
+ The introduction of ChatGPT and the subsequent improvement of Large Language
+Models (LLMs) have prompted more and more individuals to turn to the use of
+ChatBots, both for information and assistance with decision-making. However,
+the information the user is after often does not admit a definite, globally
+accepted answer, and ChatBots may fail to formulate their responses
+objectively.
+ Controversial topics, such as "religion", "gender identity", "freedom of
+speech", and "equality", among others, can be a source of conflict, as
+partisan or biased answers can reinforce preconceived notions or promote
+disinformation. By exposing ChatGPT to such debatable questions, we aim to
+understand its level of awareness and whether existing models are subject to
+socio-political and/or economic biases. We also aim to explore how
+AI-generated answers compare to human ones. For this, we use a dataset from a
+social media platform created for the purpose of debating human-generated
+claims on polemic subjects among users, dubbed Kialo.
+ Our results show that while previous versions of ChatGPT have had important
+issues with controversial topics, more recent versions of ChatGPT
+(gpt-3.5-turbo) no longer manifest significant explicit biases in several
+knowledge areas. In particular, it is well-moderated regarding economic
+aspects. However, it still maintains a degree of implicit libertarian leaning
+toward right-wing ideals, which suggests the need for increased moderation
+from the socio-political point of view. In terms of domain knowledge on
+controversial topics, with the exception of the "Philosophical" category,
+ChatGPT performs well in keeping up with the collective human level of
+knowledge. Finally, we see that sources of Bing AI have a slightly stronger
+tendency toward the center when compared to human answers. All the analyses
+we make are generalizable to other types of biases and domains.
+

+
+
+
+
+ + ☆ Spoken Language Intelligence of Large Language Models for Language + Learning + + +
+ People have long hoped for a conversational system that can assist in
+real-life situations, and recent progress on large language models (LLMs) is
+bringing this idea closer to reality. While LLMs are often impressive in
+performance, their efficacy in real-world scenarios that demand expert
+knowledge remains unclear. LLMs are believed to hold the most potential and
+value in education, especially in the development of Artificial Intelligence
+(AI)-based virtual teachers capable of facilitating language learning. Our
+focus is centered on evaluating the efficacy of LLMs in the realm of
+education, specifically in the areas of spoken language learning, which
+encompass phonetics, phonology, and second language acquisition. We introduce
+a new multiple-choice question dataset to evaluate the effectiveness of LLMs
+in the aforementioned scenarios, including the understanding and application
+of spoken language knowledge. In addition, we investigate the influence of
+various prompting techniques such as zero- and few-shot methods (prepending
+the question with question-answer exemplars), chain-of-thought (CoT, think
+step-by-step), in-domain exemplars, and external tools (Google, Wikipedia).
+We conducted a large-scale evaluation of popular LLMs (20 distinct models)
+using these methods. We achieved significant performance improvements
+compared to the zero-shot baseline on practical question reasoning (GPT-3.5,
+49.1% -> 63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of
+different sizes have a good understanding of concepts in phonetics,
+phonology, and second language acquisition, but show limitations in reasoning
+about real-world problems. Additionally, we explore preliminary findings on
+conversational communication.
+

+
+ comment: 28 pages, 7 figures, Preprint +
+
+
+
+
+ + ☆ A Multi-Task Semantic Decomposition Framework with Task-specific + Pre-training for Few-Shot NER CIKM 2023 + + +
+ The objective of few-shot named entity recognition is to identify named
+entities with limited labeled instances. Previous works have primarily
+focused on optimizing the traditional token-wise classification framework,
+while neglecting the exploration of information based on NER data
+characteristics. To address this issue, we propose a Multi-Task Semantic
+Decomposition Framework via Joint Task-specific Pre-training (MSDP) for
+few-shot NER. Drawing inspiration from demonstration-based and contrastive
+learning, we introduce two novel pre-training tasks: Demonstration-based
+Masked Language Modeling (MLM) and Class Contrastive Discrimination. These
+tasks effectively incorporate entity boundary information and enhance entity
+representation in Pre-trained Language Models (PLMs). In the downstream main
+task, we introduce a multi-task joint optimization framework with the
+semantic decomposing method, which helps the model integrate two different
+kinds of semantic information for entity classification. Experimental results
+on two few-shot NER benchmarks demonstrate that MSDP consistently outperforms
+strong baselines by a large margin. Extensive analyses validate the
+effectiveness and generalization of MSDP.
+

+
+ comment: Accepted by CIKM 2023 (Oral Presentation) +
+
+
+
+
+ + ☆ LongBench: A Bilingual, Multitask Benchmark for Long Context + Understanding + + +
+ Although large language models (LLMs) demonstrate impressive performance for
+many language tasks, most of them can only handle texts a few thousand tokens
+long, limiting their application to longer sequence inputs, such as books,
+reports, and codebases. Recent works have proposed methods to improve LLMs'
+long context capabilities by extending context windows and using more
+sophisticated memory mechanisms. However, comprehensive benchmarks tailored
+for evaluating long context understanding are lacking. In this paper, we
+introduce LongBench, the first bilingual, multi-task benchmark for long
+context understanding, enabling a more rigorous evaluation of this
+capability. LongBench comprises 21 datasets across 6 task categories in both
+English and Chinese, with an average length of 6,711 words (English) and
+13,386 characters (Chinese). These tasks cover key long-text application
+areas including single-doc QA, multi-doc QA, summarization, few-shot
+learning, synthetic tasks, and code completion. All datasets in LongBench are
+standardized into a unified format, allowing for effortless automatic
+evaluation of LLMs. Upon comprehensive evaluation of 8 LLMs on LongBench, we
+find that: (1) The commercial model (GPT-3.5-Turbo-16k) outperforms the
+open-sourced models, but still struggles on longer contexts. (2) Scaled
+position embeddings and fine-tuning on longer sequences lead to substantial
+improvement on long context understanding. (3) Context compression techniques
+such as retrieval bring improvement for models with weak long-context
+ability, but their performance still lags behind models that have strong long
+context understanding capability. The code and datasets are available at
+https://github.com/THUDM/LongBench.
+

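Assuming the datasets are also mirrored on the Hugging Face Hub under THUDM/LongBench (the subset name below is one of the listed tasks; we have not verified the hosting), loading would look like:

```python
from datasets import load_dataset

data = load_dataset("THUDM/LongBench", "hotpotqa", split="test")
print(len(data), data.column_names)  # field names of the unified format
```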
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Multimodal Detection of Social Spambots in Twitter using Transformers + + +
+ Although not all bots are malicious, the vast majority of them are
+responsible for spreading misinformation and manipulating public opinion
+about several issues, e.g., elections. Therefore, the early detection of
+social spambots is crucial. Although methods have been proposed for detecting
+bots in social media, substantial limitations remain. For instance, existing
+research initiatives still extract a large number of features and train
+traditional machine learning algorithms, or use GloVe embeddings and train
+LSTMs. However, feature extraction is a tedious procedure demanding domain
+expertise, and transformer-based language models have proved better than
+LSTMs. Other approaches create large graphs and train graph neural networks,
+which requires many hours of training and access to computational resources.
+To tackle these limitations, this is the first study employing only the user
+description field and images of three channels denoting the type and content
+of tweets posted by the users. Firstly, we create digital DNA sequences,
+transform them to 3d images, and apply pretrained models of the vision
+domain, including EfficientNet, AlexNet, VGG16, etc. Next, we propose a
+multimodal approach, where we use TwHIN-BERT for getting the textual
+representation of the user description field and employ VGG16 for acquiring
+the visual representation for the image modality. We propose three different
+fusion methods, namely concatenation, gated multimodal unit, and crossmodal
+attention, for fusing the different modalities and compare their
+performances. Extensive experiments conducted on the Cresci '17 dataset
+demonstrate valuable advantages of our introduced approaches over
+state-of-the-art ones, reaching an accuracy of up to 99.98%.
+

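Of the three fusion methods, the gated multimodal unit (Arevalo et al., 2017) is the most compact to sketch; the dimensions below are illustrative stand-ins for the TwHIN-BERT and VGG16 feature sizes:

```python
import torch
import torch.nn as nn

class GatedMultimodalUnit(nn.Module):
    def __init__(self, d_text, d_img, d_out):
        super().__init__()
        self.ht = nn.Linear(d_text, d_out)
        self.hv = nn.Linear(d_img, d_out)
        self.gate = nn.Linear(d_text + d_img, d_out)

    def forward(self, xt, xv):
        ht = torch.tanh(self.ht(xt))            # text pathway
        hv = torch.tanh(self.hv(xv))            # image pathway
        z = torch.sigmoid(self.gate(torch.cat([xt, xv], dim=-1)))
        return z * ht + (1 - z) * hv            # learned per-dimension mix

gmu = GatedMultimodalUnit(d_text=768, d_img=4096, d_out=256)
print(gmu(torch.randn(2, 768), torch.randn(2, 4096)).shape)  # (2, 256)
```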
+
+
+
+
+ + ☆ An Empirical Study of Consistency Regularization for End-to-End + Speech-to-Text Translation + + +
+ Consistency regularization methods, such as R-Drop (Liang et al., 2021) and +CrossConST (Gao et al., 2023), have achieved impressive supervised and +zero-shot performance in the neural machine translation (NMT) field. Can we +also boost end-to-end (E2E) speech-to-text translation (ST) by leveraging +consistency regularization? In this paper, we conduct empirical studies on +intra-modal and cross-modal consistency and propose two training strategies, +SimRegCR and SimZeroCR, for E2E ST in regular and zero-shot scenarios. +Experiments on the MuST-C benchmark show that our approaches achieve +state-of-the-art (SOTA) performance in most translation directions. The +analyses prove that regularization brought by the intra-modal consistency, +instead of modality gap, is crucial for the regular E2E ST, and the cross-modal +consistency could close the modality gap and boost the zero-shot E2E ST +performance. + +
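The R-Drop-style ingredient is a symmetric KL term between two stochastic forward passes; how the paper instantiates it intra- and cross-modally (speech vs. transcript encoders) is beyond this sketch of ours:

```python
import torch
import torch.nn.functional as F

def symmetric_kl(logits_a, logits_b):
    """Bidirectional KL between two output distributions (dropout kept on)."""
    log_p = F.log_softmax(logits_a, dim=-1)
    log_q = F.log_softmax(logits_b, dim=-1)
    kl_pq = F.kl_div(log_q, log_p.exp(), reduction="batchmean")  # KL(p || q)
    kl_qp = F.kl_div(log_p, log_q.exp(), reduction="batchmean")  # KL(q || p)
    return 0.5 * (kl_pq + kl_qp)

a = torch.randn(4, 10, 32)          # toy decoder logits, forward pass 1
b = a + 0.1 * torch.randn_like(a)   # pass 2 differs via dropout noise
print(symmetric_kl(a, b))           # added to the usual CE training loss
```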
+
+
+
+
+ + ☆ Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware + Pre-training for KBQA CIKM 2023 + + +
+ Knowledge Base Question Answering (KBQA) aims to answer natural language
+questions with factual information such as entities and relations in KBs.
+However, traditional Pre-trained Language Models (PLMs) are directly
+pre-trained on large-scale natural language corpora, which poses challenges
+for them in understanding and representing complex subgraphs in structured
+KBs. To bridge the gap between texts and structured KBs, we propose a
+Structured Knowledge-aware Pre-training method (SKP). In the pre-training
+stage, we introduce two novel structured knowledge-aware tasks, guiding the
+model to effectively learn the implicit relationships and better
+representations of complex subgraphs. In the downstream KBQA task, we further
+design an efficient linearization strategy and an interval attention
+mechanism, which respectively help the model better encode complex subgraphs
+and shield it from the interference of irrelevant subgraphs during reasoning.
+Detailed experiments and analyses on WebQSP verify the effectiveness of SKP,
+especially the significant improvement in subgraph retrieval (+4.08% H@10).
+

+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Biomedical Entity Linking with Triple-aware Pre-Training + + +
+ Linking biomedical entities is an essential aspect of biomedical natural
+language processing tasks, such as text mining and question answering.
+However, a difficulty in linking biomedical entities with current large
+language models (LLMs) trained on a general corpus is that biomedical
+entities are scarcely distributed in texts and therefore have rarely been
+seen during training by the LLM. At the same time, those LLMs are not aware
+of the high-level semantic connections between different biomedical entities,
+which are useful for identifying similar concepts in different textual
+contexts. To cope with the aforementioned problems, some recent works have
+focused on injecting knowledge-graph information into LLMs. However, previous
+methods either ignore the relational knowledge of the entities or lead to
+catastrophic forgetting. Therefore, we propose a novel framework to pre-train
+a powerful generative LLM with a corpus synthesized from a knowledge graph
+(KG). In the evaluations, we are unable to confirm the benefit of including
+synonym, description or relational information.
+

+
+
+
+
+ + ☆ GADePo: Graph-Assisted Declarative Pooling Transformers for + Document-Level Relation Extraction + + +
+ Document-level relation extraction aims to identify relationships between +entities within a document. Current methods rely on text-based encoders and +employ various hand-coded pooling heuristics to aggregate information from +entity mentions and associated contexts. In this paper, we replace these rigid +pooling functions with explicit graph relations by leveraging the intrinsic +graph processing capabilities of the Transformer model. We propose a joint +text-graph Transformer model, and a graph-assisted declarative pooling (GADePo) +specification of the input which provides explicit and high-level instructions +for information aggregation. This allows the pooling process to be guided by +domain-specific knowledge or desired outcomes but still learned by the +Transformer, leading to more flexible and customizable pooling strategies. We +extensively evaluate our method across diverse datasets and models, and show +that our approach yields promising results that are comparable to those +achieved by the hand-coded pooling functions. + +
+
+
+
+
+ + ☆ FIRE: Food Image to REcipe generation + + +
+ Food computing has emerged as a prominent multidisciplinary field of research +in recent years. An ambitious goal of food computing is to develop end-to-end +intelligent systems capable of autonomously producing recipe information for a +food image. Current image-to-recipe methods are retrieval-based and their +success depends heavily on the dataset size and diversity, as well as the +quality of learned embeddings. Meanwhile, the emergence of powerful +attention-based vision and language models presents a promising avenue for +accurate and generalizable recipe generation, which has yet to be extensively +explored. This paper proposes FIRE, a novel multimodal methodology tailored to +recipe generation in the food computing domain, which generates the food title, +ingredients, and cooking instructions based on input food images. FIRE +leverages the BLIP model to generate titles, utilizes a Vision Transformer with +a decoder for ingredient extraction, and employs the T5 model to generate +recipes incorporating titles and ingredients as inputs. We showcase two +practical applications that can benefit from integrating FIRE with large +language model prompting: recipe customization to fit recipes to user +preferences and recipe-to-code transformation to enable automated cooking +processes. Our experimental findings validate the efficacy of our proposed +approach, underscoring its potential for future advancements and widespread +adoption in food computing. + +
+
+ comment: 5 figures, 4 tables +
+
+
+
+
+ + ☆ Effect of Attention and Self-Supervised Speech Embeddings on + Non-Semantic Speech Tasks + + +
+ Human emotion understanding is pivotal in making conversational technology
+mainstream. We view speech emotion understanding as a perception task, which
+is a more realistic setting. Across varying contexts (languages,
+demographics, etc.), different shares of people perceive the same speech
+segment differently, so the perceived emotion is not unanimous. As part of
+the ACM Multimedia 2023 Computational Paralinguistics ChallengE (ComParE) in
+the EMotion Share track, we leverage their rich dataset of multilingual
+speakers and the multi-label regression target of 'emotion share', i.e., the
+perception of that emotion. We demonstrate that the training scheme of
+different foundation models dictates their effectiveness for tasks beyond
+speech recognition, especially for non-semantic speech tasks like emotion
+understanding. This is a very complex task due to multilingual speakers,
+variability in the target labels, and the inherent imbalance in the
+regression dataset. Our results show that HuBERT-Large with a
+self-attention-based lightweight sequence model provides a 4.6% improvement
+over the reported baseline.
+

+
+ comment: Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges + Track +
+
+
+
+
+ + ☆ ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large + Language Models + + +
+ The unprecedented performance of large language models (LLMs) requires
+comprehensive and accurate evaluation. We argue that for LLM evaluation,
+benchmarks need to be comprehensive and systematic. To this end, we propose
+the ZhuJiu benchmark, which has the following strengths: (1) Multi-dimensional
+ability coverage: We comprehensively evaluate LLMs across 7 ability dimensions
+covering 51 tasks. In particular, we also propose a new benchmark that focuses
+on the knowledge ability of LLMs. (2) Multi-faceted evaluation methods
+collaboration: We use 3 different yet complementary evaluation methods to
+comprehensively evaluate LLMs, which can ensure the authority and accuracy of
+the evaluation results. (3) Comprehensive Chinese benchmark: ZhuJiu is the
+pioneering benchmark that fully assesses LLMs in Chinese, while also providing
+equally robust evaluation abilities in English. (4) Avoiding potential data
+leakage: To avoid data leakage, we construct evaluation data specifically for
+37 tasks. We evaluate 10 current mainstream LLMs and conduct an in-depth
+discussion and analysis of their results. The ZhuJiu benchmark and
+open-participation leaderboard are publicly released at
+http://www.zhujiu-benchmark.com/ and we also provide a demo video at
+https://youtu.be/qypkJ89L1Ic.
+

+
+
+
+
+ + ☆ EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models + + +
+ Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a +revolution in machine intelligence, owing to their exceptional capabilities in +a wide range of machine learning tasks. However, the transition of LLMs from +data centers to edge devices presents a set of challenges and opportunities. +While this shift can enhance privacy and availability, it is hampered by the +enormous parameter sizes of these models, leading to impractical runtime costs. +In light of these considerations, we introduce EdgeMoE, the first on-device +inference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant +of sparse LLMs that exhibit nearly constant computational complexity as their +parameter size scales. EdgeMoE achieves both memory and computational +efficiency by strategically partitioning the model across the storage +hierarchy. Specifically, non-expert weights are stored in the device's memory, +while expert weights are kept in external storage and are fetched into memory +only when they are activated. This design is underpinned by a crucial insight +that expert weights, though voluminous, are infrequently accessed due to sparse +activation patterns. To further mitigate the overhead associated with expert +I/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise +bitwidth adaptation: This method reduces the size of expert weights with an +acceptable level of accuracy loss. (2) Expert management: It predicts the +experts that will be activated in advance and preloads them into the +compute-I/O pipeline, thus further optimizing the process. In empirical +evaluations conducted on well-established MoE LLMs and various edge devices, +EdgeMoE demonstrates substantial memory savings and performance improvements +when compared to competitive baseline solutions. + +
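A toy sketch (ours) of the paging idea: non-expert weights stay resident, expert weights live in external storage and are fetched on activation through a small LRU pool, with a preload hook for predicted experts. The file layout and capacity are illustrative.

```python
from collections import OrderedDict
import numpy as np

class ExpertCache:
    def __init__(self, capacity=4):
        self.pool = OrderedDict()   # expert_id -> weights, in LRU order
        self.capacity = capacity

    def get(self, expert_id):
        if expert_id in self.pool:                 # hit: refresh recency
            self.pool.move_to_end(expert_id)
            return self.pool[expert_id]
        w = np.load(f"experts/{expert_id}.npy")    # miss: fetch from storage
        self.pool[expert_id] = w
        if len(self.pool) > self.capacity:
            self.pool.popitem(last=False)          # evict least recently used
        return w

    def preload(self, predicted_ids):
        """Fetch experts predicted to activate, overlapping I/O with compute."""
        for eid in predicted_ids:
            self.get(eid)
```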
+
+
+
+
+ + ☆ DISC-MedLLM: Bridging General Large Language Models and Real-World + Medical Consultation + + +
+ We propose DISC-MedLLM, a comprehensive solution that leverages Large
+Language Models (LLMs) to provide accurate and truthful medical responses in
+end-to-end conversational healthcare services. To construct high-quality
+Supervised Fine-Tuning (SFT) datasets, we employ three strategies: utilizing
+medical knowledge-graphs, reconstructing real-world dialogues, and
+incorporating human-guided preference rephrasing. These datasets are
+instrumental in training DISC-MedLLM, which surpasses existing medical LLMs in
+both single-turn and multi-turn consultation scenarios. Extensive experimental
+results demonstrate the effectiveness of the proposed model in bridging the
+gap between general language models and real-world medical consultation.
+Additionally, we release the constructed dataset and model weights to further
+contribute to research and development. Further details and resources can be
+found at https://github.com/FudanDISC/DISC-MedLLM
+

+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Cognitive Effects in Large Language Models ECAI + + +
+ Large Language Models (LLMs) such as ChatGPT have received enormous attention +over the past year and are now used by hundreds of millions of people every +day. The rapid adoption of this technology naturally raises questions about the +possible biases such models might exhibit. In this work, we tested one of these +models (GPT-3) on a range of cognitive effects, which are systematic patterns +that are usually found in human cognitive tasks. We found that LLMs are indeed +prone to several human cognitive effects. Specifically, we show that the +priming, distance, SNARC, and size congruity effects were presented with GPT-3, +while the anchoring effect is absent. We describe our methodology, and +specifically the way we converted real-world experiments to text-based +experiments. Finally, we speculate on the possible reasons why GPT-3 exhibits +these effects and discuss whether they are imitated or reinvented. + +
+
+ comment: Accepted and will be published in the ECAI conference +
+
+
+
+
+ + ☆ Leveraging A Medical Knowledge Graph into Large Language Models for + Diagnosis Prediction + + +
+ Electronic Health Records (EHRs) and routine documentation practices play a +vital role in patients' daily care, providing a holistic record of health, +diagnoses, and treatment. However, complex and verbose EHR narratives overload +healthcare providers, risking diagnostic inaccuracies. While Large Language +Models (LLMs) have showcased their potential in diverse language tasks, their +application in the healthcare arena needs to ensure the minimization of +diagnostic errors and the prevention of patient harm. In this paper, we outline +an innovative approach for augmenting the proficiency of LLMs in the realm of +automated diagnosis generation, achieved through the incorporation of a medical +knowledge graph (KG) and a novel graph model: Dr.Knows, inspired by the +clinical diagnostic reasoning process. We derive the KG from the National +Library of Medicine's Unified Medical Language System (UMLS), a robust +repository of biomedical knowledge. Our method negates the need for +pre-training and instead leverages the KG as an auxiliary instrument aiding in +the interpretation and summarization of complex medical concepts. Using +real-world hospital datasets, our experimental results demonstrate that the +proposed approach of combining LLMs with KG has the potential to improve the +accuracy of automated diagnosis generation. More importantly, our approach +offers an explainable diagnostic pathway, edging us closer to the realization +of AI-augmented diagnostic decision support systems. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Evaluating the Robustness to Instructions of Large Language Models + + +
+ Recently, instruction fine-tuning has risen to prominence as a potential
+method for enhancing the zero-shot capabilities of Large Language Models
+(LLMs) on novel tasks. This technique has shown an exceptional ability to
+boost the performance of moderately sized LLMs, sometimes even reaching
+performance levels comparable to those of much larger model variants. Our
+focus is on the robustness of instruction-tuned LLMs to seen and unseen
+tasks. We conducted an exploration of six models including Alpaca, Vicuna,
+WizardLM, and traditional task-oriented models (Flan-T5-XL/XXL, T0++) using
+real-world relation extraction datasets as case studies. We carried out a
+comprehensive evaluation of these instruction-following LLMs, which have been
+tuned based on open-domain instructions and task-oriented instructions. The
+main discussion is their performance and robustness towards instructions. We
+observed that in most cases, the model's performance in dealing with
+unfamiliar instructions tends to worsen significantly, and the robustness of
+the model for RE instructions deteriorates compared to QA. Further, we
+discovered that up until a certain parameter size threshold (3B), the
+performance of the FLAN-T5 model improves as the parameter count increases.
+The robustness of different scales of FLAN-T5 models to RE instructions is
+worse than their robustness to QA instructions.
+

+
+ comment: work in progress +
+
+
+
+
+ + ☆ FonMTL: Towards Multitask Learning for the Fon Language EMNLP 2023 + + +
+ The Fon language, spoken by around 2 million people, is a truly
+low-resourced African language, with a limited online presence and few
+existing datasets. Multitask learning is a learning paradigm that aims to
+improve the generalization capacity of a model by sharing knowledge across
+different but related tasks: this can be especially valuable in very
+data-scarce scenarios. In this paper, we present the first explorative
+approach to multitask learning for model capability enhancement in Natural
+Language Processing for the Fon language. Specifically, we explore the tasks
+of Named Entity Recognition (NER) and Part of Speech Tagging (POS) for Fon.
+We leverage two language model heads as encoders to build shared
+representations for the inputs, and we use linear layer blocks for
+classification relative to each task. Our results on the NER and POS tasks
+for Fon show competitive (or better) performance compared to several
+multilingual pretrained language models finetuned on single tasks.
+Additionally, we perform a few ablation studies to compare the efficiency of
+two different loss combination strategies and find that the equal loss
+weighting approach works best in our case. Our code is open-sourced at
+https://github.com/bonaventuredossou/multitask_fon.
+

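Structurally, this is a shared encoder with one classification head per task and, per the ablation, equally weighted losses. A simplified sketch of ours with a single encoder (the paper combines two language model heads as encoders):

```python
import torch
import torch.nn as nn
from transformers import AutoModel

class MultitaskTagger(nn.Module):
    def __init__(self, name="xlm-roberta-base", n_ner=9, n_pos=17):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(name)
        h = self.encoder.config.hidden_size
        self.ner_head = nn.Linear(h, n_ner)   # token-level NER logits
        self.pos_head = nn.Linear(h, n_pos)   # token-level POS logits

    def forward(self, **batch):
        x = self.encoder(**batch).last_hidden_state
        return self.ner_head(x), self.pos_head(x)

# Equal loss weighting, the strategy the ablation favored:
# loss = 0.5 * ce(ner_logits, ner_tags) + 0.5 * ce(pos_logits, pos_tags)
```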
+
+ comment: Accepted at WiNLP workshop, co-located at EMNLP 2023 +
+
+
+
+
+ + ☆ Goodhart's Law Applies to NLP's Explanation Benchmarks + + +
+ Despite the rising popularity of saliency-based explanations, the research +community remains at an impasse, facing doubts concerning their purpose, +efficacy, and tendency to contradict each other. Seeking to unite the +community's efforts around common goals, several recent works have proposed +evaluation metrics. In this paper, we critically examine two sets of metrics: +the ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics, +focusing our inquiry on natural language processing. First, we show that we can +inflate a model's comprehensiveness and sufficiency scores dramatically without +altering its predictions or explanations on in-distribution test inputs. Our +strategy exploits the tendency for extracted explanations and their complements +to be "out-of-support" relative to each other and in-distribution inputs. Next, +we demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple +method that encodes the label, even though EVAL-X is precisely motivated to +address such exploits. Our results raise doubts about the ability of current +metrics to guide explainability research, underscoring the need for a broader +reassessment of what precisely these metrics are intended to capture. + +
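For reference, the two ERASER quantities under discussion reduce to probability differences; `prob` below is any callable returning the model's probability of class y for a token sequence (our paraphrase of the standard definitions, not the paper's code):

```python
def comprehensiveness(prob, tokens, rationale_idx, y):
    """p(y | x) - p(y | x without rationale): large if removing the
    explanation hurts the prediction."""
    kept = [t for i, t in enumerate(tokens) if i not in rationale_idx]
    return prob(tokens, y) - prob(kept, y)

def sufficiency(prob, tokens, rationale_idx, y):
    """p(y | x) - p(y | rationale only): small if the explanation alone
    supports the prediction."""
    only = [tokens[i] for i in sorted(rationale_idx)]
    return prob(tokens, y) - prob(only, y)
```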
+
+
+
+
+ + ☆ SalesBot 2.0: A Human-Like Intent-Guided Chit-Chat Dataset + + +
+ In recent research on dialogue systems and corpora, there has been a +significant focus on two distinct categories: task-oriented (TOD) and +open-domain (chit-chat) dialogues. TOD systems aim to satisfy specific user +goals, such as finding a movie to watch, whereas open-domain systems primarily +focus on generating engaging conversations. A recent study by Chiu et al. +(2022) introduced SalesBot, which provides simulators and a dataset with +one-turn transition from chit-chat to task-oriented dialogues. However, the +previously generated data solely relied on BlenderBot, which raised concerns +about its long-turn naturalness and consistency during a conversation. To +address this issue, this paper aims to build SalesBot 2.0, a revised version of +the published data, by leveraging the commonsense knowledge of large language +models (LLMs) through proper prompting. The objective is to gradually bridge +the gap between chit-chat and TOD towards better naturalness and consistency. +The newly released large-scale dataset with detailed annotations exhibits +smoother transitions between topics and is more human-like in terms of +naturalness and consistency. It can serve as a valuable resource for both +academic research and commercial applications. Furthermore, our proposed +framework can be applied to generate numerous dialogues with various target +intents. + +
+
+
+
+
+ + ☆ The Cultural Psychology of Large Language Models: Is ChatGPT a Holistic + or Analytic Thinker? + + +
+ The prevalent use of Large Language Models (LLMs) has necessitated studying +their mental models, yielding noteworthy theoretical and practical +implications. Current research has demonstrated that state-of-the-art LLMs, +such as ChatGPT, exhibit certain theory of mind capabilities and possess +relatively stable Big Five and/or MBTI personality traits. In addition, +cognitive process features form an essential component of these mental models. +Research in cultural psychology indicated significant differences in the +cognitive processes of Eastern and Western people when processing information +and making judgments. While Westerners predominantly exhibit analytical +thinking that isolates things from their environment to analyze their nature +independently, Easterners often showcase holistic thinking, emphasizing +relationships and adopting a global viewpoint. In our research, we probed the +cultural cognitive traits of ChatGPT. We employed two scales that directly +measure the cognitive process: the Analysis-Holism Scale (AHS) and the Triadic +Categorization Task (TCT). Additionally, we used two scales that investigate +the value differences shaped by cultural thinking: the Dialectical Self Scale +(DSS) and the Self-construal Scale (SCS). In cognitive process tests (AHS/TCT), +ChatGPT consistently tends towards Eastern holistic thinking, but regarding +value judgments (DSS/SCS), ChatGPT does not significantly lean towards the East +or the West. We suggest that the result could be attributed to both the +training paradigm and the training data in LLM development. We discuss the +potential value of this finding for AI research and directions for future +research. + +
+
+
+
+
+ + ☆ Gender bias and stereotypes in Large Language Models + + +
+ Large Language Models (LLMs) have made substantial progress in the past +several months, shattering state-of-the-art benchmarks in many domains. This +paper investigates LLMs' behavior with respect to gender stereotypes, a known +issue for prior models. We use a simple paradigm to test the presence of gender +bias, building on but differing from WinoBias, a commonly used gender bias +dataset, which is likely to be included in the training data of current LLMs. +We test four recently published LLMs and demonstrate that they express biased +assumptions about men and women's occupations. Our contributions in this paper +are as follows: (a) LLMs are 3-6 times more likely to choose an occupation that +stereotypically aligns with a person's gender; (b) these choices align with +people's perceptions better than with the ground truth as reflected in official +job statistics; (c) LLMs in fact amplify the bias beyond what is reflected in +perceptions or the ground truth; (d) LLMs ignore crucial ambiguities in +sentence structure 95% of the time in our study items, but when explicitly +prompted, they recognize the ambiguity; (e) LLMs provide explanations for their +choices that are factually inaccurate and likely obscure the true reason behind +their predictions. That is, they provide rationalizations of their biased +behavior. This highlights a key property of these models: LLMs are trained on +imbalanced datasets; as such, even with the recent successes of reinforcement +learning with human feedback, they tend to reflect those imbalances back at us. +As with other types of societal biases, we suggest that LLMs must be carefully +tested to ensure that they treat minoritized individuals and communities +equitably. + +
+
+ comment: ACM Collective Intelligence +
+
+
+
+
+ + ☆ Neural approaches to spoken content embedding + + +
+ Comparing spoken segments is a central operation to speech processing.
+Traditional approaches in this area have favored frame-level dynamic
+programming algorithms, such as dynamic time warping, because they require no
+supervision, but they are limited in performance and efficiency. As an
+alternative, acoustic word embeddings -- fixed-dimensional vector
+representations of variable-length spoken word segments -- have begun to be
+considered for such tasks as well. However, the current space of such
+discriminative embedding models, training approaches, and their application to
+real-world downstream tasks is limited. We start by considering "single-view"
+training losses where the goal is to learn an acoustic word embedding model
+that separates same-word and different-word spoken segment pairs. Then, we
+consider "multi-view" contrastive losses. In this setting, acoustic word
+embeddings are learned jointly with embeddings of character sequences to
+generate acoustically grounded embeddings of written words, or acoustically
+grounded word embeddings.
+ In this thesis, we contribute new discriminative acoustic word embedding
+(AWE) and acoustically grounded word embedding (AGWE) approaches based on
+recurrent neural networks (RNNs). We improve model training in terms of both
+efficiency and performance. We take these developments beyond English to
+several low-resource languages and show that multilingual training improves
+performance when labeled data is limited. We apply our embedding models, both
+monolingual and multilingual, to the downstream tasks of query-by-example
+speech search and automatic speech recognition. Finally, we show how our
+embedding approaches compare with and complement more recent self-supervised
+speech models.
+

+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ MEMORY-VQ: Compression for Tractable Internet-Scale Memory + + +
+ Retrieval augmentation is a powerful but expensive method to make language +models more knowledgeable about the world. Memory-based methods like LUMEN +pre-compute token representations for retrieved passages to drastically speed +up inference. However, memory also leads to much greater storage requirements +from storing pre-computed representations. + We propose MEMORY-VQ, a new method to reduce storage requirements of +memory-augmented models without sacrificing performance. Our method uses a +vector quantization variational autoencoder (VQ-VAE) to compress token +representations. We apply MEMORY-VQ to the LUMEN model to obtain LUMEN-VQ, a +memory model that achieves a 16x compression rate with comparable performance +on the KILT benchmark. LUMEN-VQ enables practical retrieval augmentation even +for extremely large retrieval corpora. + +
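The storage saving comes from replacing each dense memory vector with an integer codebook index; a sketch of the quantization step (sizes are illustrative, and LUMEN-VQ learns the codebook with a VQ-VAE rather than fixing it as done here):

```python
import torch

def vq_compress(vectors, codebook):
    """Map each vector to its nearest codebook entry; store only the codes."""
    codes = torch.cdist(vectors, codebook).argmin(dim=1)  # (N,) int indices
    return codes, codebook[codes]                         # codes + lookup

memories = torch.randn(512, 128)      # pre-computed token representations
codebook = torch.randn(1024, 128)     # K = 1024 entries -> 10 bits per token
codes, recon = vq_compress(memories, codebook)
print(codes.shape, recon.shape)       # vs. 512 x 128 floats uncompressed
```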
+
+
+
+
+ + ☆ Multiscale Contextual Learning for Speech Emotion Recognition in + Emergency Call Center Conversations + + +
+ Emotion recognition in conversations is essential for ensuring advanced +human-machine interactions. However, creating robust and accurate emotion +recognition systems in real life is challenging, mainly due to the scarcity of +emotion datasets collected in the wild and the inability to take into account +the dialogue context. The CEMO dataset, composed of conversations between +agents and patients during emergency calls to a French call center, fills this +gap. The nature of these interactions highlights the role of the emotional flow +of the conversation in predicting patient emotions, as context can often make a +difference in understanding actual feelings. This paper presents a multi-scale +conversational context learning approach for speech emotion recognition, which +takes advantage of this hypothesis. We investigated this approach on both +speech transcriptions and acoustic segments. Experimentally, our method uses +the previous or next information of the targeted segment. In the text domain, +we tested the context window using a wide range of tokens (from 10 to 100) and +at the speech turns level, considering inputs from both the same and opposing +speakers. According to our tests, the context derived from previous tokens has +a more significant influence on accurate prediction than the following tokens. +Furthermore, taking the last speech turn of the same speaker in the +conversation seems useful. In the acoustic domain, we conducted an in-depth +analysis of the impact of the surrounding emotions on the prediction. While +multi-scale conversational context learning using Transformers can enhance +performance in the textual modality for emergency call recordings, +incorporating acoustic context is more challenging. + +
+
+
+
+
+ + ☆ CommunityFish: A Poisson-based Document Scaling With Hierarchical + Clustering + + +
+ Document scaling has been a key component in text-as-data applications for
+social scientists and a major field of interest for political researchers,
+who aim to uncover differences between speakers or parties with the help of
+probabilistic and non-probabilistic approaches. Yet, most of these techniques
+are either built upon the agnostic bag-of-words hypothesis or use prior
+information borrowed from external sources that might introduce significant
+bias into the results. While a corpus has long been considered a collection
+of documents, it can also be seen as a dense network of connected words whose
+structure can be clustered to differentiate independent groups of words,
+known as communities, based on their co-occurrences in documents. This paper
+introduces CommunityFish, an augmented version of Wordfish that applies
+hierarchical clustering, namely the Louvain algorithm, to the word space to
+yield communities: semantically coherent, independent n-grams emerging from
+the corpus, which are used as the input to the Wordfish method instead of the
+raw word space. This strategy emphasizes the interpretability of the results,
+since communities have a non-overlapping structure and hence a crucial
+informative power in discriminating parties or speakers, in addition to
+allowing a faster execution of the Poisson scaling model. Aside from yielding
+communities, assumed to be subtopic proxies, this technique outperforms the
+classic Wordfish model by highlighting historical developments in the U.S.
+State of the Union addresses, and it was found to replicate the prevailing
+political stance in Germany when using the corpus of parties' legislative
+manifestos.
+

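The pipeline's first stage is community detection on a word co-occurrence graph; a toy sketch with networkx (the Louvain API shown requires networkx >= 2.8), after which each community's counts would feed the Poisson (Wordfish) scaling model:

```python
import itertools
import networkx as nx

docs = [["tax", "cut", "growth"], ["tax", "growth", "jobs"],
        ["climate", "energy"], ["climate", "energy", "jobs"]]

G = nx.Graph()
for doc in docs:                       # weight edges by co-occurrence counts
    for u, v in itertools.combinations(sorted(set(doc)), 2):
        if G.has_edge(u, v):
            G[u][v]["weight"] += 1
        else:
            G.add_edge(u, v, weight=1)

communities = nx.community.louvain_communities(G, weight="weight", seed=1)
print(communities)   # word groups to use as Wordfish input terms
```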
+
+
+
+
+ + ☆ Attention Visualizer Package: Revealing Word Importance for Deeper + Insight into Encoder-Only Transformer Models + + +
+ This report introduces the Attention Visualizer package, which is crafted to +visually illustrate the significance of individual words in encoder-only +transformer-based models. In contrast to other methods that center on tokens +and self-attention scores, our approach will examine the words and their impact +on the final embedding representation. Libraries like this play a crucial role +in enhancing the interpretability and explainability of neural networks. They +offer the opportunity to illuminate their internal mechanisms, providing a +better understanding of how they operate and can be enhanced. You can access +the code and review examples on the following GitHub repository: +https://github.com/AlaFalaki/AttentionVisualizer. + +
+
+ comment: 12 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Parameter-Efficient Finetuning for Robust Continual Multilingual + Learning ACL + + +
+ We introduce and study the problem of Continual Multilingual Learning (CML) +where a previously trained multilingual model is periodically updated using new +data arriving in stages. If the new data is present only in a subset of +languages, we find that the resulting model shows improved performance only on +the languages included in the latest update (and a few closely related +languages) while its performance on all the remaining languages degrade +significantly. We address this challenge by proposing LAFT-URIEL, a +parameter-efficient finetuning strategy which aims to increase the number of +languages on which the model improves after an update, while reducing the +magnitude of loss in performance for the remaining languages. LAFT-URIEL uses +linguistic knowledge to balance overfitting and knowledge sharing across +languages, allowing for an additional 25% of task languages to see an +improvement in performance after an update, while also reducing the average +magnitude of losses on the remaining languages by 78% relative. + +
+
+ comment: Published at ACL Findings 2023 +
+
+
+
+
+ + ♻ ☆ Training and Meta-Evaluating Machine Translation Evaluation Metrics at + the Paragraph Level + + +
+ As research on machine translation moves to translating text beyond the
+sentence level, it remains unclear how effective automatic evaluation metrics
+are at scoring longer translations. In this work, we first propose a method
+for creating paragraph-level data for training and meta-evaluating metrics
+from existing sentence-level data. Then, we use these new datasets to
+benchmark existing sentence-level metrics as well as train learned metrics at
+the paragraph level. Interestingly, our experimental results demonstrate that
+using sentence-level metrics to score entire paragraphs is just as effective
+as using a metric designed to work at the paragraph level. We speculate this
+result can be attributed to properties of the task of reference-based
+evaluation as well as limitations of our datasets with respect to capturing
+all types of phenomena that occur in paragraph-level translations.
+
+
+ comment: Removing extra "and" from author list +
+
+
+
+
+ + ♻ ☆ Evaluating Open-QA Evaluation + + +
+ This study focuses on the evaluation of the Open Question Answering (Open-QA)
+task, which can directly estimate the factuality of large language models
+(LLMs). Current automatic evaluation methods have shown limitations,
+indicating that human evaluation remains the most reliable approach. We
+introduce a new task, Evaluating QA Evaluation (QA-Eval), and the
+corresponding dataset EVOUNA, designed to assess the accuracy of AI-generated
+answers in relation to standard answers within Open-QA. Our evaluation of
+these methods utilizes human-annotated results to measure their performance.
+Specifically, the work investigates methods that show high correlation with
+human evaluations, deeming them more reliable. We also discuss the pitfalls of
+current methods and ways to improve LLM-based evaluators. We believe this new
+QA-Eval task and the corresponding dataset EVOUNA will facilitate the
+development of more effective automatic evaluation tools and prove valuable
+for future research in this area. All resources are available at
+https://github.com/wangcunxiang/QA-Eval and are released under the Apache-2.0
+license.
+
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research
+since its inception. The technical evolution and emerging capabilities of the
+latest cohort of large language models (LLMs) have reinvigorated the subject
+beyond academia to the cultural zeitgeist. While recent NLP evaluation
+benchmark tasks test some aspects of human-imitative behaviour (e.g.,
+BIG-bench's 'human-like behavior' tasks), few, if any, examine creative
+problem solving abilities. Creative problem solving in humans is a
+well-studied topic in cognitive neuroscience, with standardized tests that
+predominantly use the ability to associate (heterogeneous) connections among
+clue words as a metric for creativity. Exposure to misleading stimuli -
+distractors dubbed red herrings - impedes human performance in such tasks via
+the fixation effect and Einstellung paradigm. In cognitive neuroscience
+studies, such fixations are experimentally induced by pre-exposing
+participants to incorrect words orthographically similar to subsequent
+word-fragments or clues. The popular British quiz show Only Connect's
+Connecting Wall segment essentially mimics Mednick's Remote Associates Test
+(RAT) formulation with built-in, deliberate red herrings, which makes it an
+ideal proxy dataset for exploring and studying the fixation effect and
+Einstellung paradigm from cognitive neuroscience in LLMs. In this paper, we
+present the novel Only Connect Wall (OCW) dataset and report results from our
+evaluation of selected pre-trained language models and LLMs on creative
+problem solving tasks like grouping clue words by heterogeneous connections
+and identifying correct open knowledge domain connections in respective
+groups. We synthetically generate two additional datasets, OCW-Randomized and
+OCW-WordNet, to further analyze our red-herrings hypothesis in language
+models. The code and link to the dataset are available at
+https://github.com/TaatiTeam/OCW.
+
+
+ comment: V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3 + (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Quantum Circuit Compiler for a Shuttling-Based Trapped-Ion Quantum + Computer + + +
+ The increasing capabilities of quantum computing hardware and the challenge
+of realizing deep quantum circuits require fully automated and efficient tools
+for compiling quantum circuits. Expressing arbitrary circuits in a sequence of
+native gates specific to the quantum computer architecture is necessary to
+make algorithms portable across the landscape of quantum hardware providers.
+In this work, we present a compiler capable of transforming and optimizing a
+quantum circuit targeting a shuttling-based trapped-ion quantum processor. It
+consists of custom algorithms set on top of the quantum circuit framework
+Pytket. The performance was evaluated for a wide range of quantum circuits,
+and the results show that the gate counts can be reduced by factors of up to
+5.1 compared to standard Pytket and up to 2.2 compared to standard Qiskit
+compilation.
+
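+ The paper's shuttling-aware passes are custom, but the compile-and-measure
+loop they plug into can be sketched with standard Pytket passes (a minimal
+illustration, not the authors' compiler):
+
+    from pytket import Circuit
+    from pytket.passes import FullPeepholeOptimise
+
+    # A small example circuit; the paper targets much deeper circuits.
+    circ = Circuit(3)
+    circ.H(0).CX(0, 1).CX(1, 2)
+    circ.Rz(0.25, 2)
+    circ.CX(1, 2).CX(0, 1).H(0)
+
+    before = circ.n_gates
+    FullPeepholeOptimise().apply(circ)  # generic optimisation pass
+    print(f"gate count: {before} -> {circ.n_gates}")
+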
+
+ comment: 35 pages, 25 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Large Language Models Vote: Prompting for Rare Disease Identification + + +
+ The emergence of generative Large Language Models (LLMs) emphasizes the need
+for accurate and efficient prompting approaches. LLMs are often applied in
+Few-Shot Learning (FSL) contexts, where tasks are executed with minimal
+training data. FSL has become popular in many Artificial Intelligence (AI)
+subdomains, including AI for health. Rare diseases affect a small fraction of
+the population, and identifying them from clinical notes inherently requires
+FSL techniques due to limited data availability, while manual data collection
+and annotation is both expensive and time-consuming. In this paper, we propose
+Models-Vote Prompting (MVP), a flexible prompting approach for improving the
+performance of LLM queries in FSL settings. MVP works by prompting numerous
+LLMs to perform the same tasks and then conducting a majority vote on the
+resulting outputs. This method achieves improved results compared to any one
+model in the ensemble on one-shot rare disease identification and
+classification tasks. We also release a novel rare disease dataset for FSL,
+available to those who signed the MIMIC-IV Data Use Agreement (DUA).
+Furthermore, in using MVP, each model is prompted multiple times, which
+substantially increases the time needed for manual annotation; to address
+this, we assess the feasibility of using JSON for automating generative LLM
+evaluation.
+
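+ A minimal sketch of the majority-vote idea (query_model is a hypothetical
+stand-in for an LLM API call; the JSON output format mirrors the automation
+point above):
+
+    import json
+    from collections import Counter
+
+    def query_model(model, prompt):
+        """Hypothetical LLM call returning a JSON string like '{"label": 1}'."""
+        raise NotImplementedError
+
+    def models_vote(models, prompt):
+        """Prompt several LLMs with the same task, majority-vote the labels."""
+        votes = [json.loads(query_model(m, prompt))["label"] for m in models]
+        return Counter(votes).most_common(1)[0][0]
+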
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Deciphering Tabular Data Using Large Language Model + + +
+ In natural language processing, the understanding of tabular data has long
+been a focal point of scholarly inquiry. The emergence of large language
+models, exemplified by ChatGPT, has prompted a wave of efforts to harness
+these models for table-based question answering. Central to our investigation
+are methodologies that strengthen the ability of such large language models to
+discern both the structure and the content of tables, ultimately enabling them
+to provide informed answers to related queries. To this end, we architect a
+distinctive module dedicated to the serialization of tables for seamless
+integration with large language models. Additionally, we institute a
+corrective mechanism within the model to rectify potential inaccuracies.
+Experimental results indicate that, although our proposed method trails the
+SOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about
+1.2% in tests on specific datasets. This research marks the first application
+of large language models to table-based question answering tasks, enhancing
+the model's comprehension of both table structures and content.
+
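+ As a hedged sketch of the serialization step (the paper's module is not
+described in detail here; the format below is an assumption):
+
+    def serialize_table(table, question):
+        """Linearize a table into a prompt an LLM can read."""
+        header = " | ".join(table[0].keys())
+        rows = "\n".join(
+            " | ".join(str(v) for v in row.values()) for row in table
+        )
+        return (
+            "Answer the question using the table below.\n"
+            f"{header}\n{rows}\n"
+            f"Question: {question}\nAnswer:"
+        )
+
+    table = [
+        {"city": "Berlin", "population": 3700000},
+        {"city": "Hamburg", "population": 1900000},
+    ]
+    print(serialize_table(table, "Which city has the larger population?"))
+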
+
+
+
+
+ + ♻ ☆ Making first order linear logic a generating grammar + + +
+ It is known that different categorial grammars have surface representations
+in a fragment of first order multiplicative linear logic (MLL1). We show that
+the fragment of interest is equivalent to the recently introduced extended
+tensor type calculus (ETTC). ETTC is a calculus of specific typed terms, which
+represent tuples of strings, more precisely bipartite graphs decorated with
+strings. Types are derived from linear logic formulas, and rules correspond to
+concrete operations on these string-labeled graphs, so that they can be
+conveniently visualized. This provides the above-mentioned fragment of MLL1
+that is relevant for language modeling not only with an alternative syntax and
+intuitive geometric representation, but also with an intrinsic deductive
+system, which it previously lacked.
+ In this work we consider a non-trivial notationally enriched variation of
+the previously introduced ETTC, which allows more concise and transparent
+computations. We present both a cut-free sequent calculus and a natural
+deduction formalism.
+
+
+ comment: Revised and extended version with detailed proofs. arXiv admin note: + substantial text overlap with arXiv:2112.15253 +
+
+
+
+
+ + ♻ ☆ Towards Versatile and Efficient Visual Knowledge Integration into + Pre-trained Language Models with Cross-Modal Adapters + + +
+ Humans learn language via multi-modal knowledge. However, due to the
+text-only pre-training scheme, most existing pre-trained language models
+(PLMs) cannot benefit from multi-modal information. To inject visual knowledge
+into PLMs, existing methods incorporate either the text or image encoder of
+vision-language models (VLMs) to encode the visual information and update all
+the original parameters of PLMs for knowledge fusion. In this paper, we
+propose a new plug-and-play module, X-adapter, to flexibly leverage the
+aligned visual and textual knowledge learned in pre-trained VLMs and
+efficiently inject it into PLMs. Specifically, we insert X-adapters into PLMs,
+and only the added parameters are updated during adaptation. To fully exploit
+the potential of VLMs, X-adapters consist of two sub-modules, V-expert and
+T-expert, which fuse VLMs' image and text representations, respectively. We
+can opt to activate different sub-modules depending on the downstream tasks.
+Experimental results show that our method can significantly improve the
+performance on object-color reasoning and natural language understanding (NLU)
+tasks compared with PLM baselines.
+
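+ A minimal PyTorch sketch of the adapter idea described above (bottleneck
+size, gating, and expert wiring are assumptions, not the paper's exact
+design):
+
+    import torch
+    import torch.nn as nn
+
+    class XAdapter(nn.Module):
+        def __init__(self, hidden, bottleneck=64):
+            super().__init__()
+            # V-expert fuses VLM image representations, T-expert text ones.
+            self.v_expert = nn.Sequential(
+                nn.Linear(hidden, bottleneck), nn.GELU(),
+                nn.Linear(bottleneck, hidden),
+            )
+            self.t_expert = nn.Sequential(
+                nn.Linear(hidden, bottleneck), nn.GELU(),
+                nn.Linear(bottleneck, hidden),
+            )
+
+        def forward(self, h, use_visual=True):
+            expert = self.v_expert if use_visual else self.t_expert
+            return h + expert(h)  # residual; only adapter weights train
+
+    h = torch.randn(2, 16, 768)  # (batch, sequence, hidden) PLM states
+    print(XAdapter(768)(h).shape)
+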
+
+
+
+
+ + ♻ ☆ Enhancing Self-Disclosure In Neural Dialog Models By Candidate + Re-ranking + + +
+ Neural language modelling has advanced the state of the art in several
+downstream Natural Language Processing (NLP) tasks. One such area is
+open-domain dialog modelling: neural dialog models based on GPT-2, such as
+DialoGPT, have shown promising performance in single-turn conversation.
+However, such (neural) dialog models have been criticized for generating
+responses which, although possibly relevant to the previous human response,
+tend to quickly dissipate human interest and descend into trivial
+conversation. One reason for such performance is the lack of an explicit
+conversation strategy in human-machine conversation. Humans employ a range of
+conversation strategies while engaging in a conversation; one key social
+strategy is self-disclosure (SD), the phenomenon of revealing information
+about oneself to others. Social penetration theory (SPT) proposes that
+communication between two people moves from shallow to deeper levels as the
+relationship progresses, primarily through self-disclosure. Disclosure helps
+create rapport among the participants engaged in a conversation. In this
+paper, a Self-disclosure Enhancement Architecture (SDEA) is introduced that
+utilizes a Self-disclosure Topic Model (SDTM) during the inference stage of a
+neural dialog model to re-rank response candidates and enhance self-disclosure
+in the model's single-turn responses.
+
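+ The re-ranking step can be sketched as follows (the scorer here is a crude
+first-person-pronoun proxy, standing in for the paper's Self-disclosure Topic
+Model):
+
+    def self_disclosure_score(response):
+        """Crude proxy for SD: density of first-person tokens."""
+        tokens = response.lower().split()
+        first_person = {"i", "my", "me", "mine", "i'm", "i've"}
+        return sum(t in first_person for t in tokens) / max(len(tokens), 1)
+
+    def rerank(candidates):
+        """Re-rank generated response candidates at inference time."""
+        return sorted(candidates, key=self_disclosure_score, reverse=True)
+
+    print(rerank([
+        "That is nice.",
+        "I love hiking too, my weekends are all about the mountains.",
+    ]))
+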
+
comment: 10 pages, 3 figures, 2 tables
+
+
+
+
+ + ♻ ☆ Out of the Cage: How Stochastic Parrots Win in Cyber Security + Environments + + +
+ Large Language Models (LLMs) have gained widespread popularity across diverse
+domains involving text generation, summarization, and various natural language
+processing tasks. Despite their inherent limitations, LLM-based designs have
+shown promising capabilities in planning and navigating open-world scenarios.
+This paper introduces a novel application of pre-trained LLMs as agents within
+cybersecurity network environments, focusing on their utility for sequential
+decision-making processes.
+ We present an approach wherein pre-trained LLMs are leveraged as attacking
+agents in two reinforcement learning environments. Our proposed agents
+demonstrate similar or better performance compared to state-of-the-art agents
+trained for thousands of episodes in most scenarios and configurations. In
+addition, the best LLM agents perform similarly to human testers of the
+environment without any additional training. This design highlights the
+potential of LLMs to efficiently address complex decision-making tasks within
+cybersecurity.
+ Furthermore, we introduce a new network security environment named
+NetSecGame. The environment is designed to eventually support complex
+multi-agent scenarios within the network security domain. The proposed
+environment mimics real network attacks and is designed to be highly modular
+and adaptable for various scenarios.
+
+
+ comment: Under review. 10 pages plus appendices, 7 figures, 4 tables. Edit: + fix e-mails and code repository +
+
+
+
+
+ + ♻ ☆ Comparing Abstractive Summaries Generated by ChatGPT to Real Summaries + Through Blinded Reviewers and Text Classification Algorithms + + +
+ Large Language Models (LLMs) have gathered significant attention due to
+their impressive performance on a variety of tasks. ChatGPT, developed by
+OpenAI, is a recent addition to the family of language models and is being
+called a disruptive technology by some, owing to its human-like
+text-generation capabilities. Although many anecdotal examples across the
+internet have probed ChatGPT's strengths and weaknesses, only a few systematic
+research studies exist. To contribute to the body of systematic research on
+ChatGPT, we evaluate the performance of ChatGPT on abstractive summarization
+by means of automated metrics and blinded human reviewers. We also build
+automatic text classifiers to detect ChatGPT-generated summaries. We found
+that while text classification algorithms can distinguish between real and
+generated summaries, humans are unable to distinguish between real summaries
+and those produced by ChatGPT.
+
+
+
+
+
+ + ♻ ☆ The Effects of Political Martyrdom on Election Results: The + Assassination of Abe + + +
+ In developed nations, assassinations are rare, and thus the impact of such
+acts on the electoral and political landscape is understudied. In this paper,
+we focus on Twitter data to examine the effects of the assassination of
+Japan's former Prime Minister Abe on the Japanese House of Councillors
+elections in 2022. We utilize sentiment analysis and emotion detection,
+together with topic modeling, on over 2 million tweets and compare them
+against tweets from previous election cycles. Our findings indicate that
+Twitter sentiment was negatively impacted by the event in the short term and
+that the social media attention span has shortened. We also discuss how
+"necropolitics" affected the outcome of the elections in favor of the
+deceased's party: there seems to have been an effect of Abe's death on the
+election outcome, though the findings warrant further investigation to be
+conclusive.
+
+
+
+
+
+ + ♻ ☆ EmotionIC: Emotional Inertia and Contagion-Driven Dependency Modeling + for Emotion Recognition in Conversation + + +
+ Emotion Recognition in Conversation (ERC) has attracted growing attention in
+recent years as a result of the advancement and implementation of
+human-computer interface technologies. In this paper, we propose a novel
+approach to dependency modeling driven by Emotional Inertia and Contagion
+(EmotionIC) for the ERC task. Our EmotionIC consists of three main components,
+i.e., Identity Masked Multi-Head Attention (IMMHA), Dialogue-based Gated
+Recurrent Unit (DiaGRU), and Skip-chain Conditional Random Field (SkipCRF).
+Compared to previous ERC models, EmotionIC can model a conversation more
+thoroughly at both the feature-extraction and classification levels. The
+proposed model attempts to integrate the advantages of attention- and
+recurrence-based methods at the feature-extraction level. Specifically, IMMHA
+is applied to capture identity-based global contextual dependencies, while
+DiaGRU is utilized to extract speaker- and temporal-aware local contextual
+information. At the classification level, SkipCRF can explicitly mine complex
+emotional flows from higher-order neighboring utterances in the conversation.
+Experimental results show that our method can significantly outperform the
+state-of-the-art models on four benchmark datasets. The ablation studies
+confirm that our modules can effectively model emotional inertia and
+contagion.
+
+
comment: 19 pages, 10 figures
+
+
+
+
+ + ♻ ☆ Communicative Agents for Software Development + + +
+ Software engineering is a domain characterized by intricate decision-making +processes, often relying on nuanced intuition and consultation. Recent +advancements in deep learning have started to revolutionize software +engineering practices through elaborate designs implemented at various stages +of software development. In this paper, we present an innovative paradigm that +leverages large language models (LLMs) throughout the entire software +development process, streamlining and unifying key processes through natural +language communication, thereby eliminating the need for specialized models at +each phase. At the core of this paradigm lies ChatDev, a virtual chat-powered +software development company that mirrors the established waterfall model, +meticulously dividing the development process into four distinct chronological +stages: designing, coding, testing, and documenting. Each stage engages a team +of agents, such as programmers, code reviewers, and test engineers, fostering +collaborative dialogue and facilitating a seamless workflow. The chat chain +acts as a facilitator, breaking down each stage into atomic subtasks. This +enables dual roles, allowing for proposing and validating solutions through +context-aware communication, leading to efficient resolution of specific +subtasks. The instrumental analysis of ChatDev highlights its remarkable +efficacy in software generation, enabling the completion of the entire software +development process in under seven minutes at a cost of less than one dollar. +It not only identifies and alleviates potential vulnerabilities but also +rectifies potential hallucinations while maintaining commendable efficiency and +cost-effectiveness. The potential of ChatDev unveils fresh possibilities for +integrating LLMs into the realm of software development. + +
+
+ comment: https://github.com/OpenBMB/ChatDev +
+
+
+
+
+ + ♻ ☆ Latent Jailbreak: A Benchmark for Evaluating Text Safety and Output + Robustness of Large Language Models + + +
+ Considerable research efforts have been devoted to ensuring that large
+language models (LLMs) align with human values and generate safe text.
+However, an excessive focus on sensitivity to certain topics can compromise
+the model's robustness in following instructions, thereby impacting its
+overall performance in completing tasks. Previous benchmarks for jailbreaking
+LLMs have primarily focused on evaluating the safety of the models without
+considering their robustness. In this paper, we propose a benchmark that
+assesses both the safety and robustness of LLMs, emphasizing the need for a
+balanced approach. To comprehensively study text safety and output robustness,
+we introduce a latent jailbreak prompt dataset in which each prompt embeds a
+malicious instruction. Specifically, we instruct the model to complete a
+regular task, such as translation, where the text to be translated contains a
+malicious instruction. To further analyze safety and robustness, we design a
+hierarchical annotation framework. We present a systematic analysis of the
+safety and robustness of LLMs with regard to the position of explicit normal
+instructions, word replacements (verbs in explicit normal instructions, target
+groups in malicious instructions, cue words for explicit normal instructions),
+and instruction replacements (different explicit normal instructions). Our
+results demonstrate that current LLMs not only prioritize certain instruction
+verbs but also exhibit varying jailbreak rates for different instruction verbs
+in explicit normal instructions. Code and data are available at
+https://github.com/qiuhuachuan/latent-jailbreak.
+
+
+ comment: Code and data are available at + https://github.com/qiuhuachuan/latent-jailbreak +
+
+
+
+
+ + ♻ ☆ Efficient Domain Adaptation of Sentence Embeddings Using Adapters + + +
+ Sentence embeddings enable us to capture the semantic similarity of short
+texts. Most sentence embedding models are trained for general semantic textual
+similarity tasks. Therefore, to use sentence embeddings in a particular
+domain, the model must be adapted to it in order to achieve good results.
+Usually, this is done by fine-tuning the entire sentence embedding model for
+the domain of interest. While this approach yields state-of-the-art results,
+all of the model's weights are updated during fine-tuning, making this method
+resource-intensive. Therefore, instead of fine-tuning entire sentence
+embedding models for each target domain individually, we propose to train
+lightweight adapters. These domain-specific adapters do not require
+fine-tuning all underlying sentence embedding model parameters. Instead, we
+only train a small number of additional parameters while keeping the weights
+of the underlying sentence embedding model fixed. Training domain-specific
+adapters makes it possible to always use the same base model, exchanging only
+the domain-specific adapters to adapt sentence embeddings to a specific
+domain. We show that using adapters for parameter-efficient domain adaptation
+of sentence embeddings yields competitive performance within 1% of a
+domain-adapted, entirely fine-tuned sentence embedding model while only
+training approximately 3.6% of the parameters.
+
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ The Re-Label Method For Data-Centric Machine Learning + + +
+ In industrial deep learning applications, manually labeled data contains a
+certain amount of noise. To address this problem and achieve a score above 90
+on the dev dataset, we present a simple method for finding the noisy data and
+having humans re-label it, given the model predictions as references during
+human labeling. In this paper, we illustrate our idea for a broad set of deep
+learning tasks, including classification, sequence tagging, object detection,
+sequence generation, and click-through rate prediction. The experimental
+results and human evaluation results verify our idea.
+
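+ A minimal sketch of the core loop (thresholds and data are illustrative
+assumptions): flag samples where the model confidently disagrees with the
+human label, then route them back to annotators with the prediction attached
+as a reference:
+
+    def flag_noisy(labels, probs, threshold=0.9):
+        """Return indices whose labels the model confidently contradicts."""
+        flagged = []
+        for i, (y, p) in enumerate(zip(labels, probs)):
+            pred = max(range(len(p)), key=p.__getitem__)
+            if pred != y and p[pred] >= threshold:
+                flagged.append(i)  # candidate for human re-labeling
+        return flagged
+
+    labels = [0, 1, 1]
+    probs = [[0.05, 0.95], [0.2, 0.8], [0.97, 0.03]]
+    print(flag_noisy(labels, probs))  # -> [0, 2]
+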
+
+
+
+
+ + ♻ ☆ MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large + Language Models + + +
+ LLMs usually exhibit limitations in incorporating new knowledge, a tendency
+to generate hallucinations, and a lack of transparency in their
+decision-making process. In this paper, we explore how to prompt LLMs with
+knowledge graphs (KG), working as a remedy to engage LLMs with up-to-date
+knowledge and elicit reasoning pathways from them. Specifically, we build a
+prompting pipeline that endows LLMs with the capability of comprehending KG
+inputs and inferring over their implicit knowledge combined with the retrieved
+external knowledge. In addition, we investigate eliciting the mind map on
+which LLMs perform the reasoning and generate the answers. It is identified
+that the produced mind map exhibits the reasoning pathways of LLMs grounded on
+the ontology of knowledge, hence bringing the prospects of probing and gauging
+LLM inference in production. The experiments on three question answering
+datasets also show that MindMap prompting leads to a striking empirical gain.
+For instance, prompting GPT-3.5 with MindMap consistently yields better
+performance than GPT-4. We also demonstrate that, with structured facts
+retrieved from the KG, MindMap can outperform a series of
+prompting-with-document-retrieval methods, benefiting from the more accurate,
+concise, and comprehensive knowledge provided by KGs.
+
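+ A minimal sketch of the prompting side (the triples and the prompt template
+are assumptions; the actual MindMap pipeline also elicits and aggregates the
+reasoning graph):
+
+    def kg_prompt(question, triples):
+        """Assemble a prompt from retrieved KG triples."""
+        evidence = "\n".join(f"({s}, {r}, {o})" for s, r, o in triples)
+        return (
+            "Knowledge graph evidence:\n"
+            f"{evidence}\n"
+            f"Question: {question}\n"
+            "Answer, and show the reasoning path over the evidence:"
+        )
+
+    triples = [
+        ("metformin", "treats", "type 2 diabetes"),
+        ("metformin", "contraindicated_with", "severe renal impairment"),
+    ]
+    print(kg_prompt("Can a patient with severe renal impairment take "
+                    "metformin?", triples))
+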
+
+ comment: 7 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Region-Aware Pretraining for Open-Vocabulary Object Detection with + Vision Transformers CVPR 2023 + + +
+ We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a
+contrastive image-text pretraining recipe to bridge the gap between
+image-level pretraining and open-vocabulary object detection. In the
+pretraining phase, we propose to randomly crop and resize regions of the
+positional embeddings instead of using the whole-image positional embeddings.
+This better matches the use of positional embeddings at the region level in
+the detection finetuning phase. In addition, we replace the common softmax
+cross entropy loss in contrastive learning with focal loss to better learn the
+informative yet difficult examples. Finally, we leverage recent advances in
+novel object proposals to improve open-vocabulary detection finetuning. We
+evaluate our full model on the LVIS and COCO open-vocabulary detection
+benchmarks and zero-shot transfer. RO-ViT achieves a state-of-the-art 34.1
+$AP_r$ on LVIS, surpassing the best existing approach by +7.8 points, in
+addition to competitive zero-shot transfer detection. Surprisingly, RO-ViT
+improves the image-level representation as well and achieves the state of the
+art on 9 out of 12 metrics on the COCO and Flickr image-text retrieval
+benchmarks, outperforming competitive approaches with larger models.
+
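+ The cropped-positional-embedding recipe can be sketched in a few lines of
+PyTorch (grid size and crop policy are assumptions):
+
+    import torch
+    import torch.nn.functional as F
+
+    def cropped_positional_embedding(pe, crop):
+        """Randomly crop a region of the 2D positional-embedding grid and
+        resize it back to the full grid, mimicking region-level use."""
+        c, h, w = pe.shape
+        top = torch.randint(0, h - crop + 1, (1,)).item()
+        left = torch.randint(0, w - crop + 1, (1,)).item()
+        region = pe[:, top:top + crop, left:left + crop]
+        return F.interpolate(region.unsqueeze(0), size=(h, w),
+                             mode="bilinear", align_corners=False).squeeze(0)
+
+    pe = torch.randn(768, 14, 14)  # ViT-style positional-embedding grid
+    print(cropped_positional_embedding(pe, crop=7).shape)  # (768, 14, 14)
+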
+
+ comment: CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B + result +
+
+
+
+
+ + ♻ ☆ Explaining Machine Learning Models in Natural Conversations: Towards a + Conversational XAI Agent + + +
+ The goal of Explainable AI (XAI) is to design methods to provide insights +into the reasoning process of black-box models, such as deep neural networks, +in order to explain them to humans. Social science research states that such +explanations should be conversational, similar to human-to-human explanations. +In this work, we show how to incorporate XAI in a conversational agent, using a +standard design for the agent comprising natural language understanding and +generation components. We build upon an XAI question bank which we extend by +quality-controlled paraphrases to understand the user's information needs. We +further systematically survey the literature for suitable explanation methods +that provide the information to answer those questions, and present a +comprehensive list of suggestions. Our work is the first step towards truly +natural conversations about machine learning models with an explanation agent. +The comprehensive list of XAI questions and the corresponding explanation +methods may support other researchers in providing the necessary information to +address users' demands. + +
+
+ comment: Accepted at The World Conference on eXplainable Artificial + Intelligence 2023 (XAI-2023) +
+
+
+
+
+ + ♻ ☆ A Survey on Evaluation of Large Language Models + + +
+ Large language models (LLMs) are gaining increasing popularity in both
+academia and industry, owing to their unprecedented performance in various
+applications. As LLMs continue to play a vital role in both research and daily
+use, their evaluation becomes increasingly critical, not only at the task
+level but also at the society level, for a better understanding of their
+potential risks. Over the past years, significant efforts have been made to
+examine LLMs from various perspectives. This paper presents a comprehensive
+review of these evaluation methods for LLMs, focusing on three key dimensions:
+what to evaluate, where to evaluate, and how to evaluate. Firstly, we provide
+an overview from the perspective of evaluation tasks, encompassing general
+natural language processing tasks, reasoning, medical usage, ethics,
+education, natural and social sciences, agent applications, and other areas.
+Secondly, we answer the "where" and "how" questions by diving into the
+evaluation methods and benchmarks, which serve as crucial components in
+assessing the performance of LLMs. Then, we summarize the success and failure
+cases of LLMs in different tasks. Finally, we shed light on several future
+challenges that lie ahead in LLM evaluation. Our aim is to offer invaluable
+insights to researchers in the realm of LLM evaluation, thereby aiding the
+development of more proficient LLMs. Our key point is that evaluation should
+be treated as an essential discipline to better assist the development of
+LLMs. We consistently maintain the related open-source materials at
+https://github.com/MLGroupJLU/LLM-eval-survey.
+
+
+ comment: 26 pages; a major update to include more recent works; + https://llm-eval.github.io/ +
+
+
+
+
+ + ♻ ☆ EcomGPT: Instruction-tuning Large Language Models with Chain-of-Task + Tasks for E-commerce + + +
+ Recently, instruction-following Large Language Models (LLMs), represented by
+ChatGPT, have exhibited exceptional performance in general Natural Language
+Processing (NLP) tasks. However, the unique characteristics of E-commerce data
+pose significant challenges to general LLMs. An LLM tailored specifically for
+E-commerce scenarios, possessing robust cross-dataset/task generalization
+capabilities, is a pressing necessity. To address this issue, in this work we
+propose the first e-commerce instruction dataset, EcomInstruct, with a total
+of 2.5 million instruction examples. EcomInstruct scales up the data size and
+task diversity by constructing atomic tasks from basic E-commerce data types,
+such as product information and user reviews. Atomic tasks are defined as
+intermediate tasks implicitly involved in solving a final task, which we also
+call Chain-of-Task tasks. We developed EcomGPT at different parameter scales
+by training the backbone model BLOOMZ on EcomInstruct. Benefiting from the
+fundamental semantic understanding capabilities acquired from the
+Chain-of-Task tasks, EcomGPT exhibits excellent zero-shot generalization
+capabilities. Extensive experiments and human evaluations demonstrate that
+EcomGPT outperforms ChatGPT in terms of cross-dataset/task generalization on
+E-commerce tasks.
+
+
+ comment: Initial version of EcomGPT +
+
+
+
+
+ + ♻ ☆ Does Human Collaboration Enhance the Accuracy of Identifying + LLM-Generated Deepfake Texts? AAAI + + +
+ Advances in Large Language Models (e.g., GPT-4, LLaMA) have improved the
+generation of coherent sentences resembling human writing on a large scale,
+resulting in the creation of so-called deepfake texts. However, this progress
+poses security and privacy concerns, necessitating effective solutions for
+distinguishing deepfake texts from human-written ones. Although prior works
+studied humans' ability to detect deepfake texts, none has examined whether
+"collaboration" among humans improves the detection of deepfake texts. In this
+study, to address this gap in the understanding of deepfake texts, we
+conducted experiments with two groups: (1) non-expert individuals from the AMT
+platform and (2) writing experts from the Upwork platform. The results
+demonstrate that collaboration among humans can potentially improve the
+detection of deepfake texts for both groups, increasing detection accuracies
+by 6.36% for non-experts and 12.76% for experts, respectively, compared to
+individuals' detection accuracies. We further analyze the explanations that
+humans used for detecting a piece of text as deepfake text, and find that the
+strongest indicator of deepfake texts is their lack of coherence and
+consistency. Our study provides useful insights for future tools and framework
+designs to facilitate the collaborative human detection of deepfake texts. The
+experiment datasets and AMT implementations are available at:
+https://github.com/huashen218/llm-deepfake-human-study.git
+
+
+ comment: Accepted at The 11th AAAI Conference on Human Computation and + Crowdsourcing (HCOMP 2023) +
+
+
+
+
+ + ♻ ☆ Scissorhands: Exploiting the Persistence of Importance Hypothesis for + LLM KV Cache Compression at Test Time + + +
+ Large language models (LLMs) have sparked a new wave of exciting AI
+applications. Hosting these models at scale requires significant memory
+resources. One crucial memory bottleneck for deployment stems from the context
+window. It is commonly recognized that model weights are memory-hungry;
+however, the size of the key-value embeddings stored during the generation
+process (the KV cache) can easily surpass the model size. The enormous size of
+the KV cache puts constraints on the inference batch size, which is crucial
+for high-throughput inference workloads. Inspired by an interesting
+observation of the attention scores, we hypothesize the persistence of
+importance: only pivotal tokens, which had a substantial influence at one
+step, will significantly influence future generations. Based on our empirical
+verification and theoretical analysis around this hypothesis, we propose
+Scissorhands, a system that maintains the memory usage of the KV cache at a
+fixed budget without finetuning the model. In essence, Scissorhands manages
+the KV cache by storing the pivotal tokens with a higher probability. We
+validate that Scissorhands reduces the inference memory usage of the KV cache
+by up to 5X without compromising model quality. We further demonstrate that
+Scissorhands can be combined with 4-bit quantization, traditionally used to
+compress model weights, to achieve up to 20X compression.
+
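+ The persistence-of-importance idea can be sketched as a budgeted KV-cache
+compaction (a toy illustration, not the Scissorhands system):
+
+    import torch
+
+    def compress_kv(k, v, attn, budget):
+        """Keep only the tokens with the highest accumulated attention.
+        attn: (queries, keys) weights accumulated over recent steps."""
+        importance = attn.sum(dim=0)  # per-token influence
+        keep = importance.topk(budget).indices.sort().values
+        return k[keep], v[keep]
+
+    seq, d = 128, 64
+    k, v = torch.randn(seq, d), torch.randn(seq, d)
+    attn = torch.softmax(torch.randn(16, seq), dim=-1)
+    k_small, v_small = compress_kv(k, v, attn, budget=32)
+    print(k_small.shape)  # -> (32, 64)
+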
+
+
+
+
+ + ♻ ☆ When Do Annotator Demographics Matter? Measuring the Influence of + Annotator Demographics with the POPQUORN Dataset + + +
+ Annotators are not fungible. Their demographics, life experiences, and
+backgrounds all contribute to how they label data. However, NLP has only
+recently considered how annotator identity might influence their decisions.
+Here, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering,
+Offensiveness, text Rewriting, and politeness rating with demographic Nuance).
+POPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a
+sample representative of the US population with regard to sex, age, and race.
+Through a series of analyses, we show that annotators' backgrounds play a
+significant role in their judgments. Further, our work shows that backgrounds
+not previously considered in NLP (e.g., education) are meaningful and should
+be considered. Our study suggests that understanding the background of
+annotators and collecting labels from a demographically balanced pool of crowd
+workers is important to reduce dataset bias. The dataset, annotator
+background, and annotation interface are available at
+https://github.com/Jiaxin-Pei/potato-prolific-dataset.
+
+
+
+
+
+ + ♻ ☆ Blockwise Parallel Transformer for Large Context Models + + +
+ Transformers have emerged as the cornerstone of state-of-the-art natural +language processing models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands posed by the +self-attention mechanism and the large feedforward network in Transformers +limit their ability to handle long sequences, thereby creating challenges for +tasks involving multiple long sequences or long-term dependencies. We present a +distinct approach, Blockwise Parallel Transformer (BPT), that leverages +blockwise computation of self-attention and feedforward network fusion to +minimize memory costs. By processing longer input sequences while maintaining +memory efficiency, BPT enables training sequences 32 times longer than vanilla +Transformers and up to 4 times longer than previous memory-efficient methods. +Extensive experiments on language modeling and reinforcement learning tasks +demonstrate the effectiveness of BPT in reducing memory requirements and +improving performance. + +
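+ The memory saving comes from never materializing the full (seq x seq) score
+matrix. A toy sketch that blocks only the query dimension is below; BPT
+additionally blocks keys/values with an online softmax and fuses the
+feedforward computation:
+
+    import torch
+
+    def blockwise_attention(q, k, v, block=128):
+        """Attention computed one query block at a time."""
+        out, scale = [], q.shape[-1] ** -0.5
+        for i in range(0, q.shape[0], block):
+            scores = (q[i:i + block] @ k.T) * scale  # (block, seq) only
+            out.append(torch.softmax(scores, dim=-1) @ v)
+        return torch.cat(out)
+
+    seq, d = 1024, 64
+    q, k, v = (torch.randn(seq, d) for _ in range(3))
+    ref = torch.softmax((q @ k.T) * d ** -0.5, dim=-1) @ v
+    assert torch.allclose(blockwise_attention(q, k, v), ref, atol=1e-5)
+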
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 120 + +
+
+
+ + ☆ Efficient Discovery and Effective Evaluation of Visual Perceptual + Similarity: A Benchmark and Beyond ICCV 2023 + + +
+ Visual similarities discovery (VSD) is an important task with broad
+e-commerce applications. Given an image of a certain object, the goal of VSD
+is to retrieve images of different objects with high perceptual visual
+similarity. Although VSD is a widely addressed problem, the evaluation of
+proposed methods is often based on a proxy identification-retrieval task,
+evaluating the ability of a model to retrieve different images of the same
+object. We posit that evaluating VSD methods based on identification tasks is
+limited, and faithful evaluation must rely on expert annotations. In this
+paper, we introduce the first large-scale fashion visual similarity benchmark
+dataset, consisting of more than 110K expert-annotated image pairs. Besides
+this major contribution, we share insights from the challenges we faced while
+curating this dataset. Based on these insights, we propose a novel and
+efficient labeling procedure that can be applied to any dataset. Our analysis
+examines its limitations and inductive biases, and based on these findings, we
+propose metrics to mitigate those limitations. Though our primary focus lies
+on visual similarity, the methodologies we present have broader applications
+for discovering and evaluating perceptual similarity across various domains.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ MagicEdit: High-Fidelity and Temporally Coherent Video Editing + + +
+ In this report, we present MagicEdit, a surprisingly simple yet effective
+solution to the text-guided video editing task. We found that high-fidelity
+and temporally coherent video-to-video translation can be achieved by
+explicitly disentangling the learning of content, structure and motion signals
+during training. This is in contrast to most existing methods, which attempt
+to jointly model both the appearance and the temporal representation within a
+single framework, which, we argue, leads to degradation in per-frame quality.
+Despite its simplicity, we show that MagicEdit supports various downstream
+video editing tasks, including video stylization, local editing,
+video-MagicMix and video outpainting.
+
+
+ comment: Project page: https://magic-edit.github.io/ +
+
+
+
+
+ + ☆ MagicAvatar: Multimodal Avatar Generation and Animation + + +
+ This report presents MagicAvatar, a framework for multimodal video
+generation and animation of human avatars. Unlike most existing methods that
+generate avatar-centric videos directly from multimodal inputs (e.g., text
+prompts), MagicAvatar explicitly disentangles avatar video generation into two
+stages: (1) multimodal-to-motion and (2) motion-to-video generation. The first
+stage translates the multimodal inputs into motion/control signals (e.g.,
+human pose, depth, DensePose), while the second stage generates avatar-centric
+video guided by these motion signals. Additionally, MagicAvatar supports
+avatar animation by simply providing a few images of the target person. This
+capability enables the animation of the provided human identity according to
+the specific motion derived from the first stage. We demonstrate the
+flexibility of MagicAvatar through various applications, including text-guided
+and video-guided avatar generation, as well as multimodal avatar animation.
+
+
+ comment: Project page: https://magic-avatar.github.io/ +
+
+
+
+
+ + ☆ CoVR: Learning Composed Video Retrieval from Web Video Captions + + +
+ Composed Image Retrieval (CoIR) has recently gained popularity as a task that +considers both text and image queries together, to search for relevant images +in a database. Most CoIR approaches require manually annotated datasets, +comprising image-text-image triplets, where the text describes a modification +from the query image to the target image. However, manual curation of CoIR +triplets is expensive and prevents scalability. In this work, we instead +propose a scalable automatic dataset creation methodology that generates +triplets given video-caption pairs, while also expanding the scope of the task +to include composed video retrieval (CoVR). To this end, we mine paired videos +with a similar caption from a large database, and leverage a large language +model to generate the corresponding modification text. Applying this +methodology to the extensive WebVid2M collection, we automatically construct +our WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we +introduce a new benchmark for CoVR with a manually annotated evaluation set, +along with baseline results. Our experiments further demonstrate that training +a CoVR model on our dataset effectively transfers to CoIR, leading to improved +state-of-the-art performance in the zero-shot setup on both the CIRR and +FashionIQ benchmarks. Our code, datasets, and models are publicly available at +https://imagine.enpc.fr/~ventural/covr. + +
+
+
+
+
+ + ☆ Total Selfie: Generating Full-Body Selfies + + +
+ We present a method to generate full-body selfies -- photos that you take of +yourself, but capturing your whole body as if someone else took the photo of +you from a few feet away. Our approach takes as input a pre-captured video of +your body, a target pose photo, and a selfie + background pair for each +location. We introduce a novel diffusion-based approach to combine all of this +information into high quality, well-composed photos of you with the desired +pose and background. + +
+
+ comment: Project page: + https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/ +
+
+
+
+
+ + ☆ Flexible Techniques for Differentiable Rendering with 3D Gaussians + + +
+ Fast, reliable shape reconstruction is an essential ingredient in many
+computer vision applications. Neural Radiance Fields demonstrated that
+photorealistic novel view synthesis is within reach, but performance
+requirements for fast reconstruction of real scenes and objects remained a
+limiting factor. Several recent approaches have built on alternative shape
+representations, in particular, 3D Gaussians. We develop extensions to these
+renderers, such as integrating differentiable optical flow, exporting
+watertight meshes and rendering per-ray normals. Additionally, we show how two
+of the recent methods are interoperable with each other. These reconstructions
+are quick, robust, and easily performed on GPU or CPU. For code and visual
+examples, see https://leonidk.github.io/fmb-plus
+
+
+
+
+
+ + ☆ PanoSwin: a Pano-style Swin Transformer for Panorama Understanding CVPR 2023 + + +
+ In panorama understanding, the widely used equirectangular projection (ERP)
+entails boundary discontinuity and spatial distortion. These artifacts
+severely degrade conventional CNNs and vision Transformers on panoramas. In
+this paper, we propose a simple yet effective architecture named PanoSwin to
+learn panorama representations with ERP. To deal with the challenges brought
+by equirectangular projection, we explore a pano-style shift windowing scheme
+and novel pitch attention to address the boundary discontinuity and the
+spatial distortion, respectively. Besides, based on spherical distance and
+Cartesian coordinates, we adapt absolute positional embeddings and relative
+positional biases for panoramas to enhance panoramic geometry information.
+Realizing that planar image understanding might share some common knowledge
+with panorama understanding, we devise a novel two-stage learning framework to
+facilitate knowledge transfer from planar images to panoramas. We conduct
+experiments against the state of the art on various panoramic tasks, i.e.,
+panoramic object detection, panoramic classification, and panoramic layout
+estimation. The experimental results demonstrate the effectiveness of PanoSwin
+in panorama understanding.
+
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ☆ R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras ICCV 2023 + + +
+ Dense 3D reconstruction and ego-motion estimation are key challenges in +autonomous driving and robotics. Compared to the complex, multi-modal systems +deployed today, multi-camera systems provide a simpler, low-cost alternative. +However, camera-based 3D reconstruction of complex dynamic scenes has proven +extremely difficult, as existing solutions often produce incomplete or +incoherent results. We propose R3D3, a multi-camera system for dense 3D +reconstruction and ego-motion estimation. Our approach iterates between +geometric estimation that exploits spatial-temporal information from multiple +cameras, and monocular depth refinement. We integrate multi-camera feature +correlation and dense bundle adjustment operators that yield robust geometric +depth and pose estimates. To improve reconstruction where geometric depth is +unreliable, e.g. for moving objects or low-textured regions, we introduce +learnable scene priors via a depth refinement network. We show that this design +enables a dense, consistent 3D reconstruction of challenging, dynamic outdoor +environments. Consequently, we achieve state-of-the-art dense depth prediction +on the DDAD and NuScenes benchmarks. + +
+
+ comment: Accepted to ICCV 2023. Project page is available at + https://www.vis.xyz/pub/r3d3/ +
+
+
+
+
+ + ☆ VideoCutLER: Surprisingly Simple Unsupervised Video Instance + Segmentation + + +
+ Existing approaches to unsupervised video instance segmentation typically
+rely on motion estimates and experience difficulties tracking small or
+divergent motions. We present VideoCutLER, a simple method for unsupervised
+multi-instance video segmentation without using motion-based learning signals
+like optical flow or training on natural videos. Our key insight is that using
+high-quality pseudo masks and a simple video synthesis method for model
+training is surprisingly sufficient to enable the resulting video model to
+effectively segment and track multiple instances across video frames. We show
+the first competitive unsupervised learning results on the challenging
+YouTubeVIS-2019 benchmark, achieving 50.7% AP50^video, surpassing the previous
+state of the art by a large margin. VideoCutLER can also serve as a strong
+pretrained model for supervised video instance segmentation tasks, exceeding
+DINO by 15.9% on YouTubeVIS-2019 in terms of AP^video.
+
+
+ comment: Preprint. Code: https://github.com/facebookresearch/CutLER +
+
+
+
+
+ + ☆ 360-Degree Panorama Generation from Few Unregistered NFoV Images + + +
+ 360$^\circ$ panoramas are extensively utilized as environmental light
+sources in computer graphics. However, capturing a 360$^\circ$ $\times$
+180$^\circ$ panorama poses challenges due to the necessity of specialized and
+costly equipment, and additional human resources. Prior studies develop
+various learning-based generative methods to synthesize panoramas from a
+single Narrow Field-of-View (NFoV) image, but they are limited in alterable
+input patterns, generation quality, and controllability. To address these
+issues, we propose a novel pipeline called PanoDiff, which efficiently
+generates complete 360$^\circ$ panoramas using one or more unregistered NFoV
+images captured from arbitrary angles. Our approach has two primary components
+to overcome these limitations: first, a two-stage angle prediction module that
+handles varying numbers of NFoV inputs; second, a novel latent diffusion-based
+panorama generation model that uses the incomplete panorama and text prompts
+as control signals and utilizes several geometric augmentation schemes to
+ensure the geometric properties of the generated panoramas. Experiments show
+that PanoDiff achieves state-of-the-art panoramic generation quality and high
+controllability, making it suitable for applications such as content editing.
+
+
+ comment: Accepted to ACM Multimedia 2023 (MM' 23). Code is available: + https://github.com/shanemankiw/Panodiff +
+
+
+
+
+ + ☆ Video-Based Hand Pose Estimation for Remote Assessment of Bradykinesia + in Parkinson's Disease + + +
+ There is a growing interest in using pose estimation algorithms for
+video-based assessment of bradykinesia in Parkinson's Disease (PD) to
+facilitate remote disease assessment and monitoring. However, the accuracy of
+pose estimation algorithms in videos from video streaming services during
+Telehealth appointments has not been studied. In this study, we used seven
+off-the-shelf hand pose estimation models to estimate the movement of the
+thumb and index fingers in videos of the finger-tapping (FT) test recorded
+from Healthy Controls (HC) and participants with PD under two different
+conditions: streaming (videos recorded during a live Zoom meeting) and
+on-device (videos recorded locally with high-quality cameras). The accuracy
+and reliability of the models were estimated by comparing the models' output
+with manual results. Three of the seven models demonstrated good accuracy for
+on-device recordings, and the accuracy decreased significantly for streaming
+recordings. We observed a negative correlation between movement speed and the
+models' accuracy for the streaming recordings. Additionally, we evaluated the
+reliability of ten movement features related to bradykinesia extracted from
+video recordings of PD patients performing the FT test. While most of the
+features demonstrated excellent reliability for on-device recordings, most
+demonstrated poor to moderate reliability for streaming recordings. Our
+findings highlight the limitations of pose estimation algorithms when applied
+to video recordings obtained during Telehealth visits, and demonstrate that
+on-device recordings can be used for automatic video assessment of
+bradykinesia in PD.
+
+
+ comment: 12 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Neural Network-Based Histologic Remission Prediction In Ulcerative + Colitis + + +
+ BACKGROUND & AIMS: Histological remission (HR) is advocated and considered
+a new therapeutic target in ulcerative colitis (UC). Diagnosis of histologic
+remission currently relies on biopsy; during this process, patients are at
+risk for bleeding, infection, and post-biopsy fibrosis. In addition,
+histologic response scoring is complex and time-consuming, and there is
+heterogeneity among pathologists. Endocytoscopy (EC) is a novel
+ultra-high-magnification endoscopic technique that can provide excellent in
+vivo assessment of glands. Based on the EC technique, we propose a neural
+network model that can assess histological disease activity in UC using EC
+images to address the above issues. The experimental results demonstrate that
+the proposed method can assist patients in precise treatment and prognostic
+assessment.
+ METHODS: We construct a neural network model for UC evaluation. A total of
+5105 images of 154 intestinal segments from 87 patients undergoing EC
+treatment at a center in China between March 2022 and March 2023 are scored
+according to the Geboes score. Subsequently, 103 intestinal segments are used
+as the training set, 16 intestinal segments are used as the validation set for
+neural network training, and the remaining 35 intestinal segments are used as
+the test set to measure the model performance together with the validation
+set.
+ RESULTS: By treating HR as the negative category and histologic activity as
+the positive category, the proposed neural network model achieves an accuracy
+of 0.9, a specificity of 0.95, a sensitivity of 0.75, and an area under the
+curve (AUC) of 0.81.
+ CONCLUSION: We develop a specific neural network model that can distinguish
+histologic remission/activity in EC images of UC, which helps to accelerate
+clinical histological diagnosis.
+ keywords: ulcerative colitis; Endocytoscopy; Geboes score; neural network.
+
+
+
+
+
+ + ☆ Comparison of automated crater catalogs for Mars from Benedix et al. + (2020) and Lee and Hogan (2021) + + +
+ Crater mapping using neural networks and other automated methods has +increased recently with automated Crater Detection Algorithms (CDAs) applied to +planetary bodies throughout the solar system. A recent publication by Benedix +et al. (2020) showed high performance at small scales compared to similar +automated CDAs but with a net positive diameter bias in many crater candidates. +I compare the publicly available catalogs from Benedix et al. (2020) and Lee & +Hogan (2021) and show that the reported performance is sensitive to the metrics +used to test the catalogs. I show how the more permissive comparison methods +indicate a higher CDA performance by allowing worse candidate craters to match +ground-truth craters. I show that the Benedix et al. (2020) catalog has a +substantial performance loss with increasing latitude and identify an image +projection issue that might cause this loss. Finally, I suggest future +applications of neural networks in generating large scientific datasets be +validated using secondary networks with independent data sources or training +methods. + +
+
+ comment: 14 pages, 6 figures. Accepted August 13th 2023 +
+
+
+
+
+ + ☆ VesselShot: Few-shot learning for cerebral blood vessel segmentation + + +
+ Angiography is widely used to detect, diagnose, and treat cerebrovascular
+diseases. While numerous techniques have been proposed to segment the vascular
+network from different imaging modalities, deep learning (DL) has emerged as a
+promising approach. However, existing DL methods often depend on proprietary
+datasets and extensive manual annotation. Moreover, the availability of
+pre-trained networks specifically for medical domains and 3D volumes is
+limited. To overcome these challenges, we propose a few-shot learning approach
+called VesselShot for cerebrovascular segmentation. VesselShot leverages
+knowledge from a few annotated support images and mitigates the scarcity of
+labeled data and the need for extensive annotation in cerebral blood vessel
+segmentation. We evaluated the performance of VesselShot using the publicly
+available TubeTK dataset for the segmentation task, achieving a mean Dice
+coefficient (DC) of 0.62 (0.03).
+
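+ For reference, the reported metric is the Dice coefficient between binary
+masks (a standard definition, shown here with illustrative arrays):
+
+    import numpy as np
+
+    def dice(pred, target, eps=1e-7):
+        """Dice coefficient (DC) between binary segmentation masks."""
+        inter = np.logical_and(pred, target).sum()
+        return float((2 * inter + eps) / (pred.sum() + target.sum() + eps))
+
+    pred = np.array([[1, 1, 0], [0, 1, 0]], dtype=bool)
+    target = np.array([[1, 0, 0], [0, 1, 1]], dtype=bool)
+    print(round(dice(pred, target), 3))  # -> 0.667
+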
+
+
+
+
+ + ☆ Compositional Semantic Mix for Domain Adaptation in Point Cloud + Segmentation + + +
+ Deep-learning models for 3D point cloud semantic segmentation exhibit limited +generalization capabilities when trained and tested on data captured with +different sensors or in varying environments due to domain shift. Domain +adaptation methods can be employed to mitigate this domain shift, for instance, +by simulating sensor noise, developing domain-agnostic generators, or training +point cloud completion networks. Often, these methods are tailored for range +view maps or necessitate multi-modal input. In contrast, domain adaptation in +the image domain can be executed through sample mixing, which emphasizes input +data manipulation rather than employing distinct adaptation modules. In this +study, we introduce compositional semantic mixing for point cloud domain +adaptation, representing the first unsupervised domain adaptation technique for +point cloud segmentation based on semantic and geometric sample mixing. We +present a two-branch symmetric network architecture capable of concurrently +processing point clouds from a source domain (e.g. synthetic) and point clouds +from a target domain (e.g. real-world). Each branch operates within one domain +by integrating selected data fragments from the other domain and utilizing +semantic information derived from source labels and target (pseudo) labels. +Additionally, our method can leverage a limited number of human point-level +annotations (semi-supervised) to further enhance performance. We assess our +approach in both synthetic-to-real and real-to-real scenarios using LiDAR +datasets and demonstrate that it significantly outperforms state-of-the-art +methods in both unsupervised and semi-supervised settings. + +
+ comment: TPAMI. arXiv admin note: text overlap with arXiv:2207.09778
+ ☆ VoroMesh: Learning Watertight Surface Meshes with Voronoi Diagrams
+ In stark contrast to the case of images, finding a concise, learnable discrete representation of 3D surfaces remains a challenge. In particular, while polygon meshes are arguably the most common surface representation used in geometry processing, their irregular and combinatorial structure often makes them unsuitable for learning-based applications. In this work, we present VoroMesh, a novel and differentiable Voronoi-based representation of watertight 3D shape surfaces. From a set of 3D points (called generators) and their associated occupancy, we define our boundary representation through the Voronoi diagram of the generators as the subset of Voronoi faces whose two associated (equidistant) generators are of opposite occupancy: the resulting polygon mesh forms a watertight approximation of the target shape's boundary. To learn the position of the generators, we propose a novel loss function, dubbed VoroLoss, that minimizes the distance from ground-truth surface samples to the closest faces of the Voronoi diagram, without requiring an explicit construction of the entire diagram. A direct optimization of the VoroLoss to obtain generators on the Thingi32 dataset demonstrates the geometric efficiency of our representation compared to axiomatic meshing algorithms and recent learning-based mesh representations. We further use VoroMesh in a learning-based mesh prediction task from input SDF grids on the ABC dataset, and show comparable performance to state-of-the-art methods while guaranteeing closed output surfaces free of self-intersections.
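+ A brute-force sketch of the described VoroLoss idea follows: for each ground-truth surface sample, candidate Voronoi faces are taken as bisector planes between nearby generator pairs of opposite occupancy, so no explicit diagram is built. This naive O(S*k^2) loop only illustrates the loss; it is not the paper's implementation.
+ import torch
+
+ def bisector_distance(p, g_i, g_j):
+     # Unsigned distance from point p to the bisector plane of a generator
+     # pair; this plane carries the Voronoi face when the pair is adjacent.
+     n = torch.nn.functional.normalize(g_j - g_i, dim=-1)
+     m = 0.5 * (g_i + g_j)
+     return ((p - m) * n).sum(dim=-1).abs()
+
+ def voroloss_sketch(samples, generators, occupancy, k=8):
+     # For each surface sample: take its k nearest generators, form pairs
+     # of opposite occupancy, and penalize the distance to the closest
+     # candidate bisector plane.
+     knn = torch.cdist(samples, generators).topk(k, largest=False).indices
+     total = samples.new_zeros(())
+     for s in range(samples.shape[0]):
+         g, occ = generators[knn[s]], occupancy[knn[s]]
+         dists = [bisector_distance(samples[s], g[a], g[b])
+                  for a in range(k) for b in range(a + 1, k) if occ[a] != occ[b]]
+         if dists:
+             total = total + torch.stack(dists).min()
+     return total / samples.shape[0]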
+ ☆ MS-Net: A Multi-modal Self-supervised Network for Fine-Grained Classification of Aircraft in SAR Images
+ Synthetic aperture radar (SAR) imaging technology is commonly used to provide 24-hour all-weather earth observation. However, it still has some drawbacks in SAR target classification, especially in fine-grained classification of aircraft: aircraft in SAR images have large intra-class diversity and inter-class similarity, and the number of effective samples is insufficient and hard to annotate. To address these issues, this article proposes a novel multi-modal self-supervised network (MS-Net) for fine-grained classification of aircraft. Firstly, in order to fully exploit the potential of multi-modal information, a two-sided path feature extraction network (TSFE-N) is constructed to enhance the image features of the target and obtain domain-knowledge features from the text modality. Secondly, a contrastive self-supervised learning (CSSL) framework is employed to effectively learn useful label-independent features from unbalanced data, and a similarity perception loss (SPloss) is proposed to avoid network overfitting. Finally, TSFE-N is used as the encoder of CSSL to obtain the classification results. Extensive experiments show that MS-Net can effectively reduce the difficulty of classifying similar types of aircraft. Without labels, the proposed algorithm achieves an accuracy of 88.46% on the 17-class aircraft classification task, which has pioneering significance in the field of fine-grained classification of aircraft in SAR images.
+ ☆ A Transformer-Conditioned Neural Fields Pipeline with Polar Coordinate Representation for Astronomical Radio Interferometric Data Reconstruction
+ In radio astronomy, visibility data, which are measurements of wave signals from radio telescopes, are transformed into images for observation of distant celestial objects. However, these resultant images usually contain both real sources and artifacts, due to signal sparsity and other factors. One way to obtain cleaner images is to reconstruct samples into dense forms before imaging. Unfortunately, existing visibility reconstruction methods may miss some components of the frequency data, so blurred object edges and persistent artifacts remain in the images. Furthermore, the computational overhead is high on irregular visibility samples due to data skew. To address these problems, we propose PolarRec, a reconstruction method for interferometric visibility data, which consists of a transformer-conditioned neural fields pipeline with a polar coordinate representation. This representation matches the way in which telescopes observe a celestial area as the Earth rotates. We further propose a Radial Frequency Loss function, which uses radial coordinates in the polar coordinate system to correlate with frequency information and thereby helps reconstruct complete visibility. We also group visibility sample points by angular coordinates in the polar coordinate system, and use these groups as the granularity for subsequent encoding with a Transformer encoder. Consequently, our method can capture the inherent characteristics of visibility data effectively and efficiently. Our experiments demonstrate that PolarRec markedly improves imaging results by faithfully reconstructing all frequency components in the visibility domain while significantly reducing the computation cost.
+ ☆ A Generalization of Continuous Relaxation in Structured Pruning
+ Deep learning harnesses massive parallel floating-point processing to train and evaluate large neural networks. Trends indicate that deeper and larger neural networks with an increasing number of parameters achieve higher accuracy than smaller neural networks. This performance improvement, which often requires heavy compute for both training and evaluation, eventually needs to translate well to resource-constrained hardware for practical value. Structured pruning asserts that while large networks enable us to find solutions to complex computer vision problems, a smaller, computationally efficient sub-network can be derived from the large neural network that retains model accuracy but significantly improves computational efficiency. We generalize structured pruning with algorithms for network augmentation, pruning, sub-network collapse and removal. In addition, we demonstrate efficient and stable convergence up to 93% sparsity and 95% FLOPs reduction without loss of inference accuracy using continuous relaxation, matching or exceeding the state of the art for all structured pruning methods. The resulting CNN executes efficiently on GPU hardware without computationally expensive sparse matrix operations. We achieve this with routine, automatable operations on classification and segmentation problems using the CIFAR-10, ImageNet, and CityScapes datasets with the ResNet and U-Net network architectures.
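+ A minimal sketch of what a continuously relaxed structured-pruning gate can look like in PyTorch; the paper's augmentation, collapse, and removal algorithms are not reproduced here, only the relaxation step they build on:
+ import torch
+ import torch.nn as nn
+
+ class ChannelGate(nn.Module):
+     # Continuously relaxed on/off gate per output channel. Lowering the
+     # temperature during training sharpens the sigmoid toward a hard 0/1
+     # decision; channels whose gate saturates near zero can then be
+     # collapsed and physically removed from the network.
+     def __init__(self, num_channels, temperature=1.0):
+         super().__init__()
+         self.logits = nn.Parameter(torch.zeros(num_channels))
+         self.temperature = temperature
+
+     def forward(self, x):                     # x: (N, C, H, W)
+         gate = torch.sigmoid(self.logits / self.temperature)
+         return x * gate.view(1, -1, 1, 1)
+
+     def surviving_channels(self, threshold=0.05):
+         gate = torch.sigmoid(self.logits / self.temperature)
+         return (gate > threshold).nonzero(as_tuple=True)[0]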
+ comment: 10 pages
+ ☆ SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space Reconstruction
+ Segment Anything Model (SAM) has received remarkable attention as it offers a powerful and versatile solution for object segmentation in images. However, fine-tuning SAM for downstream segmentation tasks under different scenarios remains a challenge, as the varied characteristics of different scenarios naturally require diverse model parameter spaces. Most existing fine-tuning methods attempt to bridge the gaps among different scenarios by introducing a set of new parameters to modify SAM's original parameter space. Unlike these works, in this paper, we propose fine-tuning SAM efficiently by parameter space reconstruction (SAM-PARSER), which introduces nearly zero trainable parameters during fine-tuning. In SAM-PARSER, we assume that SAM's original parameter space is relatively complete, so that its bases are able to reconstruct the parameter space of a new scenario. We obtain the bases by matrix decomposition, and fine-tune the coefficients to reconstruct the parameter space tailored to the new scenario through an optimal linear combination of the bases. Experimental results show that SAM-PARSER exhibits superior segmentation performance across various scenarios, while reducing the number of trainable parameters by $\approx 290$ times compared with current parameter-efficient fine-tuning methods.
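+ The description suggests a decomposition-based reparameterization. Under that reading, a sketch using an SVD of a pretrained weight matrix, with the bases frozen and only a per-basis coefficient trained (the actual SAM-PARSER decomposition and wiring may differ):
+ import torch
+ import torch.nn as nn
+
+ class SVDReparam(nn.Module):
+     # Freeze the SVD bases (U, V) of a pretrained weight matrix and train
+     # only one coefficient per basis direction, reconstructing
+     # W' = U diag(c * s) V^T as a linear combination of the frozen bases.
+     def __init__(self, weight):               # weight: (out, in), pretrained
+         super().__init__()
+         U, S, Vh = torch.linalg.svd(weight, full_matrices=False)
+         self.register_buffer("U", U)
+         self.register_buffer("S", S)
+         self.register_buffer("Vh", Vh)
+         self.coeff = nn.Parameter(torch.ones_like(S))  # the only trainables
+
+     def forward(self, x):                     # x: (N, in)
+         W = self.U @ torch.diag(self.coeff * self.S) @ self.Vh
+         return x @ W.T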
+ ☆ S-TREK: Sequential Translation and Rotation Equivariant Keypoints for local feature extraction ICCV 2023
+ In this work we introduce S-TREK, a novel local feature extractor that +combines a deep keypoint detector, which is both translation and rotation +equivariant by design, with a lightweight deep descriptor extractor. We train +the S-TREK keypoint detector within a framework inspired by reinforcement +learning, where we leverage a sequential procedure to maximize a reward +directly related to keypoint repeatability. Our descriptor network is trained +following a "detect, then describe" approach, where the descriptor loss is +evaluated only at those locations where keypoints have been selected by the +already trained detector. Extensive experiments on multiple benchmarks confirm +the effectiveness of our proposed method, with S-TREK often outperforming other +state-of-the-art methods in terms of repeatability and quality of the recovered +poses, especially when dealing with in-plane rotations. + +
+ comment: Accepted at ICCV 2023
+ ☆ Adversarial Attacks on Foundational Vision Models
+ Rapid progress is being made in developing large, pretrained, task-agnostic foundational vision models such as CLIP, ALIGN, and DINOv2. In fact, we are approaching the point where these models do not have to be finetuned downstream and can simply be used zero-shot or with a lightweight probing head. Critically, given the complexity of working at this scale, there is a bottleneck where relatively few organizations in the world train these models and then share them on centralized platforms such as HuggingFace and torch.hub. The goal of this work is to identify several key adversarial vulnerabilities of these models in an effort to make future designs more robust. Intuitively, our attacks manipulate deep feature representations to fool an out-of-distribution (OOD) detector, which will be required when using these open-world-aware models to solve closed-set downstream tasks. Our methods reliably make in-distribution (ID) images (w.r.t. a downstream task) be predicted as OOD and vice versa, while operating under extremely low-knowledge-assumption threat models. We show our attacks to be potent in whitebox and blackbox settings, as well as when transferred across foundational model types (e.g., attacking DINOv2 with CLIP). This work is only the beginning of a long journey towards adversarially robust foundational vision models.
+ ☆ LatentDR: Improving Model Generalization Through Sample-Aware Latent Degradation and Restoration
+ Despite significant advances in deep learning, models often struggle to +generalize well to new, unseen domains, especially when training data is +limited. To address this challenge, we propose a novel approach for +distribution-aware latent augmentation that leverages the relationships across +samples to guide the augmentation procedure. Our approach first degrades the +samples stochastically in the latent space, mapping them to augmented labels, +and then restores the samples from their corrupted versions during training. +This process confuses the classifier in the degradation step and restores the +overall class distribution of the original samples, promoting diverse +intra-class/cross-domain variability. We extensively evaluate our approach on a +diverse set of datasets and tasks, including domain generalization benchmarks +and medical imaging datasets with strong domain shift, where we show our +approach achieves significant improvements over existing methods for latent +space augmentation. We further show that our method can be flexibly adapted to +long-tail recognition tasks, demonstrating its versatility in building more +generalizable models. Code is available at +https://github.com/nerdslab/LatentDR. + +
+ ☆ Neural Network Training Strategy to Enhance Anomaly Detection Performance: A Perspective on Reconstruction Loss Amplification
+ Unsupervised anomaly detection (UAD) is a widely adopted approach in industry due to rare anomaly occurrences and data imbalance. A desirable characteristic of a UAD model is a contained generalization ability that excels in the reconstruction of seen normal patterns but struggles with unseen anomalies. Recent studies have sought to contain the generalization capability of their UAD models in reconstruction from different perspectives, such as the design of the neural network (NN) structure and the training strategy. In contrast, we note that containment of the generalization ability in reconstruction can also be obtained simply from a steep-shaped loss landscape. Motivated by this, we propose a loss landscape sharpening method that amplifies the reconstruction loss, dubbed Loss AMPlification (LAMP). LAMP deforms the loss landscape into a steep shape so that the reconstruction error on unseen anomalies becomes greater. Accordingly, anomaly detection performance is improved without any change to the NN architecture. Our findings suggest that LAMP can be easily applied to any reconstruction error metric in UAD settings where the reconstruction model is trained with anomaly-free samples only.
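+ The exact LAMP formulation is not given in the abstract; below is one illustrative way to amplify a reconstruction loss so that its gradient steepens as the error grows, which is the stated intent. The log transform is an assumption for the sketch, not the paper's function.
+ import torch
+
+ def amplified_reconstruction_loss(x, x_hat, eps=1e-6):
+     # Plain MSE has a constant curvature; wrapping it in -log(1 - mse)
+     # makes the loss (and its gradient, 1 / (1 - mse)) grow faster for
+     # larger reconstruction errors, steepening the loss landscape.
+     mse = torch.mean((x - x_hat) ** 2)
+     return -torch.log(1.0 - torch.clamp(mse, max=1.0 - eps))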
+ comment: 5 pages, 4 figures, 2 tables
+ ☆ Learning to Read Analog Gauges from Synthetic Data
+ Manually reading and logging gauge data is time-inefficient, and the effort increases with the number of gauges. We present a computer vision pipeline that automates the reading of analog gauges. We propose a two-stage CNN pipeline that identifies the key structural components of an analog gauge and outputs an angular reading. To facilitate the training of our approach, a synthetic dataset is generated, yielding a set of realistic analog gauges with their corresponding annotations. To validate our proposal, an additional real-world dataset of 4,813 manually curated images was collected. When compared against state-of-the-art methodologies, our method shows a significant improvement of 4.55 in the average error, which is a 52% relative improvement. The resources for this project will be made available at: https://github.com/fuankarion/automatic-gauge-reading.
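+ Once the first stage has located the dial's reference marks and the needle, converting the angular reading into a value is plain geometry; a sketch, where the min/max angles and the value range are hypothetical gauge parameters rather than anything from the paper:
+ import math
+
+ def angle_to_value(needle_angle, min_angle, max_angle, min_value, max_value):
+     # Map the detected needle angle (radians) linearly onto the gauge's
+     # value range, assuming the dial center and the min/max tick positions
+     # were found by the detection stage.
+     span = (max_angle - min_angle) % (2 * math.pi)
+     frac = ((needle_angle - min_angle) % (2 * math.pi)) / span
+     return min_value + frac * (max_value - min_value)
+
+ # e.g. a pressure gauge sweeping 270 degrees from 0 to 10 bar:
+ print(angle_to_value(math.radians(135), math.radians(-45),
+                      math.radians(225), 0.0, 10.0))   # -> about 6.67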
+ ☆ Referring Image Segmentation Using Text Supervision ICCV 2023
+ Existing Referring Image Segmentation (RIS) methods typically require expensive pixel-level or box-level annotations for supervision. In this paper, we observe that the referring texts used in RIS already provide sufficient information to localize the target object. Hence, we propose a novel weakly-supervised RIS framework that formulates the target localization problem as a classification process differentiating between positive and negative text expressions. While the referring text expressions for an image are used as positive expressions, the referring text expressions from other images can be used as negative expressions for this image. Our framework has three main novelties. First, we propose a bilateral prompt method to facilitate the classification process by harmonizing the domain discrepancy between visual and linguistic features. Second, we propose a calibration method to reduce noisy background information and improve the correctness of the response maps for target object localization. Third, we propose a positive response map selection strategy to generate high-quality pseudo-labels from the enhanced response maps for training a segmentation network for RIS inference. For evaluation, we propose a new metric to measure localization accuracy. Experiments on four benchmarks show that our framework achieves promising performance compared to existing fully-supervised RIS methods while outperforming state-of-the-art weakly-supervised methods adapted from related areas. Code is available at https://github.com/fawnliu/TRIS.
+ comment: ICCV 2023
+ ☆ SAAN: Similarity-aware attention flow network for change detection with VHR remote sensing images
+ Change detection (CD) is a fundamental and important task for monitoring the +land surface dynamics in the earth observation field. Existing deep +learning-based CD methods typically extract bi-temporal image features using a +weight-sharing Siamese encoder network and identify change regions using a +decoder network. These CD methods, however, still perform far from +satisfactorily as we observe that 1) deep encoder layers focus on irrelevant +background regions and 2) the models' confidence in the change regions is +inconsistent at different decoder stages. The first problem is because deep +encoder layers cannot effectively learn from imbalanced change categories using +the sole output supervision, while the second problem is attributed to the lack +of explicit semantic consistency preservation. To address these issues, we +design a novel similarity-aware attention flow network (SAAN). SAAN +incorporates a similarity-guided attention flow module with deeply supervised +similarity optimization to achieve effective change detection. Specifically, we +counter the first issue by explicitly guiding deep encoder layers to discover +semantic relations from bi-temporal input images using deeply supervised +similarity optimization. The extracted features are optimized to be +semantically similar in the unchanged regions and dissimilar in the changing +regions. The second drawback can be alleviated by the proposed +similarity-guided attention flow module, which incorporates similarity-guided +attention modules and attention flow mechanisms to guide the model to focus on +discriminative channels and regions. We evaluated the effectiveness and +generalization ability of the proposed method by conducting experiments on a +wide range of CD tasks. The experimental results demonstrate that our method +achieves excellent performance on several CD tasks, with discriminative +features and semantic consistency preserved. + +
+ comment: 15 pages, 13 figures
+ ☆ Face Presentation Attack Detection by Excavating Causal Clues and Adapting Embedding Statistics WACV 2024
+ Recent face presentation attack detection (PAD) leverages domain adaptation +(DA) and domain generalization (DG) techniques to address performance +degradation on unknown domains. However, DA-based PAD methods require access to +unlabeled target data, while most DG-based PAD solutions rely on a priori, +i.e., known domain labels. Moreover, most DA-/DG-based methods are +computationally intensive, demanding complex model architectures and/or +multi-stage training processes. This paper proposes to model face PAD as a +compound DG task from a causal perspective, linking it to model optimization. +We excavate the causal factors hidden in the high-level representation via +counterfactual intervention. Moreover, we introduce a class-guided MixStyle to +enrich feature-level data distribution within classes instead of focusing on +domain information. Both class-guided MixStyle and counterfactual intervention +components introduce no extra trainable parameters and negligible computational +resources. Extensive cross-dataset and analytic experiments demonstrate the +effectiveness and efficiency of our method compared to state-of-the-art PADs. +The implementation and the trained weights are publicly available. + +
+ comment: Accepted at WACV 2024
+ ☆ Semi-Supervised Learning for Visual Bird's Eye View Semantic Segmentation
+ Visual bird's eye view (BEV) semantic segmentation helps autonomous vehicles understand the surrounding environment only from images, including static elements (e.g., roads) and dynamic elements (e.g., vehicles, pedestrians). However, the high cost of the annotation procedures of fully-supervised methods, which usually need HD maps, 3D object bounding boxes, and camera extrinsic matrices, limits the capability of visual BEV semantic segmentation. In this paper, we present a novel semi-supervised framework for visual BEV semantic segmentation that boosts performance by exploiting unlabeled images during training. A consistency loss that makes full use of unlabeled data is then proposed to constrain the model on not only the semantic prediction but also the BEV feature. Furthermore, we propose a novel and effective data augmentation method named conjoint rotation, which reasonably augments the dataset while maintaining the geometric relationship between the front-view images and the BEV semantic segmentation. Extensive experiments on the nuScenes and Argoverse datasets show that our semi-supervised framework can effectively improve prediction accuracy. To the best of our knowledge, this is the first work that explores improving visual BEV semantic segmentation performance using unlabeled data. The code will be publicly available.
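+ A sketch of a consistency loss of the kind described, constraining both the semantic prediction and the BEV feature on unlabeled images; the model interface, the teacher/student arrangement, and the loss weighting are assumptions for illustration:
+ import torch
+ import torch.nn.functional as F
+
+ def consistency_loss(model, unlabeled_imgs, augment):
+     # `model` is assumed to return (bev_feature, bev_semantic_logits).
+     with torch.no_grad():
+         feat_t, logits_t = model(unlabeled_imgs)          # clean pass (target)
+     feat_s, logits_s = model(augment(unlabeled_imgs))     # augmented pass
+     # Constrain the semantic prediction ...
+     loss_sem = F.kl_div(F.log_softmax(logits_s, dim=1),
+                         F.softmax(logits_t, dim=1), reduction="batchmean")
+     # ... and the BEV feature itself, as described above.
+     loss_feat = F.mse_loss(feat_s, feat_t)
+     return loss_sem + loss_feat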
+ ☆ LAC -- Latent Action Composition for Skeleton-based Action Segmentation ICCV 2023
+ Skeleton-based action segmentation requires recognizing composable actions in untrimmed videos. Current approaches decouple this problem by first extracting local visual features from skeleton sequences and then processing them with a temporal model to classify frame-wise actions. However, their performance remains limited as the visual features cannot sufficiently express composable actions. In this context, we propose Latent Action Composition (LAC), a novel self-supervised framework aiming at learning from synthesized composable motions for skeleton-based action segmentation. LAC is composed of a novel generation module for synthesizing new sequences. Specifically, we design a linear latent space in the generator to represent primitive motion. New composed motions can be synthesized by simply performing arithmetic operations on latent representations of multiple input skeleton sequences. LAC leverages such synthesized sequences, which have large diversity and complexity, for learning visual representations of skeletons in both sequence and frame spaces via contrastive learning. The resulting visual encoder has high expressive power and can be effectively transferred onto action segmentation tasks by end-to-end fine-tuning without the need for additional temporal models. We conduct a transfer-learning study and show that representations learned from pre-trained LAC outperform the state of the art by a large margin on the TSU, Charades, and PKU-MMD datasets.
+ comment: ICCV 2023
+ ☆ PointHPS: Cascaded 3D Human Pose and Shape Estimation from Point Clouds
+ Human pose and shape estimation (HPS) has attracted increasing attention in recent years. While most existing studies focus on HPS from 2D images or videos with inherent depth ambiguity, there is a surging need to investigate HPS from 3D point clouds, as depth sensors have been frequently employed in commercial devices. However, real-world sensory 3D points are usually noisy and incomplete, and human bodies can exhibit highly diverse poses. To tackle these challenges, we propose a principled framework, PointHPS, for accurate 3D HPS from point clouds captured in real-world settings, which iteratively refines point features through a cascaded architecture. Specifically, each stage of PointHPS performs a series of downsampling and upsampling operations to extract and collate both local and global cues, which are further enhanced by two novel modules: 1) Cross-stage Feature Fusion (CFF) for multi-scale feature propagation that allows information to flow effectively through the stages, and 2) Intermediate Feature Enhancement (IFE) for body-aware feature aggregation that improves feature quality after each stage. To facilitate a comprehensive study under various scenarios, we conduct our experiments on two large-scale benchmarks, comprising i) a dataset that features diverse subjects and actions captured by real commercial sensors in a laboratory environment, and ii) controlled synthetic data generated with realistic considerations such as clothed humans in crowded outdoor scenes. Extensive experiments demonstrate that PointHPS, with its powerful point feature extraction and processing scheme, outperforms state-of-the-art methods by significant margins across the board. Homepage: https://caizhongang.github.io/projects/PointHPS/.
+ ☆ Group Regression for Query Based Object Detection and Tracking ITSC 2023
+ Group regression is commonly used in 3D object detection to predict box +parameters of similar classes in a joint head, aiming to benefit from +similarities while separating highly dissimilar classes. For query-based +perception methods, this has, so far, not been feasible. We close this gap and +present a method to incorporate multi-class group regression, especially +designed for the 3D domain in the context of autonomous driving, into existing +attention and query-based perception approaches. We enhance a transformer based +joint object detection and tracking model with this approach, and thoroughly +evaluate its behavior and performance. For group regression, the classes of the +nuScenes dataset are divided into six groups of similar shape and prevalence, +each being regressed by a dedicated head. We show that the proposed method is +applicable to many existing transformer based perception approaches and can +bring potential benefits. The behavior of query group regression is thoroughly +analyzed in comparison to a unified regression head, e.g. in terms of +class-switching behavior and distribution of the output parameters. The +proposed method offers many possibilities for further research, such as in the +direction of deep multi-hypotheses tracking. + +
+ comment: Accepted for publication at the 2023 26th IEEE International Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28, 2023, in Bilbao, Spain
+ ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+ comment: Accepted by ICCV2023
+ ☆ Medical needle tip tracking based on Optical Imaging and AI
+ Deep needle insertion to a target often poses a huge challenge, requiring a combination of specialized skills, assistive technology, and extensive training. One of the frequently encountered medical scenarios demanding such expertise is needle insertion into a femoral vessel in the groin. After access to the femoral vessel, various medical procedures, such as cardiac catheterization and extracorporeal membrane oxygenation (ECMO), can be performed. However, even with the aid of ultrasound imaging, achieving successful insertion can necessitate multiple attempts due to the complexities of anatomy and tissue deformation. To address this challenge, this paper presents an innovative technology for real-time needle tip tracking, aiming for enhanced needle insertion guidance. Specifically, our approach revolves around the creation of scattering imaging using an optical fiber-equipped needle, and uses Convolutional Neural Network (CNN) based algorithms to enable real-time estimation of the needle tip's position and orientation during insertion procedures. The efficacy of the proposed technology was rigorously evaluated through three experiments. The first two experiments involved rubber and bacon phantoms to simulate groin anatomy; the positional errors averaged 2.3±1.5mm and 2.0±1.2mm, and the orientation errors averaged 0.2±0.11rad and 0.16±0.1rad. Furthermore, the system's capabilities were validated through experiments conducted on a fresh porcine phantom mimicking more complex anatomical structures, yielding a positional accuracy of 3.2±3.1mm and an orientational accuracy of 0.19±0.1rad. Given the average femoral arterial radius of 4 to 5mm, the proposed system demonstrates great potential for precise needle guidance in femoral artery insertion procedures. In addition, the findings highlight the broader potential applications of the system in the medical field.
+ ☆ Pixel-Aware Stable Diffusion for Realistic Image Super-resolution and Personalized Stylization
+ Realistic image super-resolution (Real-ISR) aims to reproduce perceptually realistic image details from a low-quality input. The commonly used adversarial training based Real-ISR methods often introduce unnatural visual artifacts and fail to generate realistic textures for natural scene images. The recently developed generative stable diffusion models provide a potential solution to Real-ISR with pre-learned strong image priors. However, the existing methods along this line either fail to keep faithful pixel-wise image structures or resort to extra skipped connections to reproduce details, which requires additional training in image space and limits their extension to other related tasks in latent space such as image stylization. In this work, we propose a pixel-aware stable diffusion (PASD) network to achieve robust Real-ISR as well as personalized stylization. Specifically, a pixel-aware cross attention module is introduced to enable diffusion models to perceive local image structures at the pixel level, while a degradation removal module is used to extract degradation-insensitive features to guide the diffusion process together with high-level image information. By simply replacing the base diffusion model with a personalized one, our method can generate diverse stylized images without the need to collect pairwise training data. PASD can be easily integrated into existing diffusion models such as Stable Diffusion. Experiments on Real-ISR and personalized stylization demonstrate the effectiveness of our proposed approach. The source code and models can be found at https://github.com/yangxy/PASD.
+ ☆ Improving the performance of object detection by preserving label distribution
+ Object detection is a task that performs position identification and label classification of objects in images or videos. The information obtained through this process plays an essential role in various tasks in the field of computer vision. In object detection, the data utilized for training and validation typically originate from public datasets that are well-balanced in terms of the number of objects ascribed to each class in an image. However, in real-world scenarios, handling datasets with much greater class imbalance, i.e., very different numbers of objects for each class, is much more common, and this imbalance may reduce the performance of object detection when predicting unseen test images. In our study, we therefore propose a method that evenly distributes the classes in an image for training and validation, solving the class imbalance problem in object detection. Our proposed method aims to maintain a uniform class distribution through multi-label stratification. We tested our proposed method not only on public datasets that typically exhibit balanced class distribution but also on custom datasets that may have imbalanced class distribution. We found that our proposed method was more effective on datasets containing severe imbalance and less data. Our findings indicate that the proposed method can be effectively used on datasets with substantially imbalanced class distribution.
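+ Multi-label stratification of detection images can be done with the third-party iterative-stratification package (pip install iterative-stratification); the linked repository's name suggests such a stratified K-fold workflow for YOLO, though the exact pipeline is an assumption, and the input file name below is hypothetical:
+ import numpy as np
+ from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
+
+ # One row per image; per-class object counts turned into binary presence
+ # indicators, which is what the multi-label stratifier expects.
+ counts = np.load("per_image_class_counts.npy")   # (n_images, n_classes)
+ presence = (counts > 0).astype(int)
+ X = np.arange(len(presence)).reshape(-1, 1)
+
+ mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)
+ for fold, (train_idx, val_idx) in enumerate(mskf.split(X, presence)):
+     # Each fold now sees a near-uniform class distribution.
+     print(fold, len(train_idx), len(val_idx))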
+ comment: Code is available at https://github.com/leeheewon-01/YOLOstratifiedKFold/tree/main
+ ☆ Spatio-Temporal Analysis of Patient-Derived Organoid Videos Using Deep Learning for the Prediction of Drug Efficacy
+ Over the last ten years, Patient-Derived Organoids (PDOs) emerged as the most +reliable technology to generate ex-vivo tumor avatars. PDOs retain the main +characteristics of their original tumor, making them a system of choice for +pre-clinical and clinical studies. In particular, PDOs are attracting interest +in the field of Functional Precision Medicine (FPM), which is based upon an +ex-vivo drug test in which living tumor cells (such as PDOs) from a specific +patient are exposed to a panel of anti-cancer drugs. Currently, the Adenosine +Triphosphate (ATP) based cell viability assay is the gold standard test to +assess the sensitivity of PDOs to drugs. The readout is measured at the end of +the assay from a global PDO population and therefore does not capture single +PDO responses and does not provide time resolution of drug effect. To this end, +in this study, we explore for the first time the use of powerful large +foundation models for the automatic processing of PDO data. In particular, we +propose a novel imaging-based high-throughput screening method to assess +real-time drug efficacy from a time-lapse microscopy video of PDOs. The +recently proposed SAM algorithm for segmentation and DINOv2 model are adapted +in a comprehensive pipeline for processing PDO microscopy frames. Moreover, an +attention mechanism is proposed for fusing temporal and spatial features in a +multiple instance learning setting to predict ATP. We report better results +than other non-time-resolved methods, indicating that the temporality of data +is an important factor for the prediction of ATP. Extensive ablations shed +light on optimizing the experimental setting and automating the prediction both +in real-time and for forecasting. + +
+ ☆ ExpCLIP: Bridging Text and Facial Expressions via Semantic Alignment
+ The objective of stylized speech-driven facial animation is to create animations that encapsulate specific emotional expressions. Existing methods often depend on pre-established emotional labels or facial expression templates, which may limit the flexibility necessary for accurately conveying user intent. In this research, we introduce a technique that enables the control of arbitrary styles by leveraging natural language as emotion prompts. This technique presents benefits in terms of both flexibility and user-friendliness. To realize this objective, we initially construct a Text-Expression Alignment Dataset (TEAD), wherein each facial expression is paired with several prompt-like descriptions. We propose an innovative automatic annotation method, supported by Large Language Models (LLMs), to expedite the dataset construction, thereby eliminating the substantial expense of manual annotation. Following this, we utilize TEAD to train a CLIP-based model, termed ExpCLIP, which encodes text and facial expressions into semantically aligned style embeddings. The embeddings are subsequently integrated into the facial animation generator to yield expressive and controllable facial animations. Given the limited diversity of facial emotions in existing speech-driven facial animation training data, we further introduce an effective Expression Prompt Augmentation (EPA) mechanism to enable the animation generator to support unprecedented richness in style control. Comprehensive experiments illustrate that our method accomplishes expressive facial animation generation and offers enhanced flexibility in effectively conveying the desired style.
+ ☆ Data-iterative Optimization Score Model for Stable Ultra-Sparse-View CT Reconstruction
+ Score-based generative models (SGMs) have gained prominence in sparse-view CT reconstruction for their precise sampling of complex distributions. In SGM-based reconstruction, data consistency in the score-based diffusion model ensures close adherence of generated samples to the observed data distribution, which is crucial for improving image quality. Shortcomings in data consistency characterization manifest in three aspects. Firstly, data from the optimization process can lead to artifacts in reconstructed images. Secondly, it often neglects that the generative model and the original data constraints are completed independently, fragmenting their unity. Thirdly, it predominantly focuses on constraining intermediate results in the inverse sampling process rather than the ideal real images. Thus, this paper introduces the data-iterative optimization score-based model (DOSM), integrating an innovative data-consistency term into the stochastic differential equation, a valuable constraint for ultra-sparse-view CT reconstruction. The novelty of this data consistency element lies in its sole reliance on original measurement data to confine generation outcomes, effectively balancing measurement data and generative model constraints. Additionally, we pioneer an inference strategy that traces back from current iteration results to the ideal truth, enhancing reconstruction stability. We leverage conventional iteration techniques to optimize DOSM updates. Quantitative and qualitative results from 23 views of numerical and clinical cardiac datasets demonstrate DOSM's superiority over other methods. Remarkably, even with 10 views, our method achieves excellent performance.
+ comment: 11 pages, 12 figures
+ ☆ Graph-based Asynchronous Event Processing for Rapid Object Recognition ICCV 2021
+ Different from traditional video cameras, event cameras capture an asynchronous event stream in which each event encodes pixel location, trigger time, and the polarity of the brightness change. In this paper, we introduce a novel graph-based framework for event cameras, namely SlideGCN. Unlike some recent graph-based methods that use groups of events as input, our approach can efficiently process data event-by-event, unlocking the low-latency nature of event data while still maintaining the graph's structure internally. For fast graph construction, we develop a radius search algorithm, which better exploits the partially regular structure of the event cloud than k-d tree based generic methods. Experiments show that our method reduces the computational complexity up to 100 times with respect to current graph-based methods while keeping state-of-the-art performance on object recognition. Moreover, we verify the superiority of event-wise processing with our method: when the state becomes stable, we can give a prediction with high confidence, thus enabling early recognition. Project page: https://zju3dv.github.io/slide_gcn/.
+ comment: Accepted to ICCV 2021. Project Page: https://zju3dv.github.io/slide_gcn/
+ ☆ Multi-Scale and Multi-Layer Contrastive Learning for Domain Generalization
+ During the past decade, deep neural networks have led to fast-paced progress +and significant achievements in computer vision problems, for both academia and +industry. Yet despite their success, state-of-the-art image classification +approaches fail to generalize well in previously unseen visual contexts, as +required by many real-world applications. In this paper, we focus on this +domain generalization (DG) problem and argue that the generalization ability of +deep convolutional neural networks can be improved by taking advantage of +multi-layer and multi-scaled representations of the network. We introduce a +framework that aims at improving domain generalization of image classifiers by +combining both low-level and high-level features at multiple scales, enabling +the network to implicitly disentangle representations in its latent space and +learn domain-invariant attributes of the depicted objects. Additionally, to +further facilitate robust representation learning, we propose a novel objective +function, inspired by contrastive learning, which aims at constraining the +extracted representations to remain invariant under distribution shifts. We +demonstrate the effectiveness of our method by evaluating on the domain +generalization datasets of PACS, VLCS, Office-Home and NICO. Through extensive +experimentation, we show that our model is able to surpass the performance of +previous DG methods and consistently produce competitive and state-of-the-art +results in all datasets. + +
+ comment: Manuscript under review at: IEEE Transactions on Artificial Intelligence
+ ☆ INF: Implicit Neural Fusion for LiDAR and Camera IROS 2023
+ Sensor fusion has become a popular topic in robotics. However, conventional +fusion methods encounter many difficulties, such as data representation +differences, sensor variations, and extrinsic calibration. For example, the +calibration methods used for LiDAR-camera fusion often require manual operation +and auxiliary calibration targets. Implicit neural representations (INRs) have +been developed for 3D scenes, and the volume density distribution involved in +an INR unifies the scene information obtained by different types of sensors. +Therefore, we propose implicit neural fusion (INF) for LiDAR and camera. INF +first trains a neural density field of the target scene using LiDAR frames. +Then, a separate neural color field is trained using camera images and the +trained neural density field. Along with the training process, INF both +estimates LiDAR poses and optimizes extrinsic parameters. Our experiments +demonstrate the high accuracy and stable performance of the proposed method. + +
+ comment: Accepted to IROS 2023. (project page: https://ShuyiZhou495.github.io/inf-project-page/)
+ ☆ Steerable Conditional Diffusion for Out-of-Distribution Adaptation in Imaging Inverse Problems
+ Denoising diffusion models have emerged as the go-to framework for solving +inverse problems in imaging. A critical concern regarding these models is their +performance on out-of-distribution (OOD) tasks, which remains an under-explored +challenge. Realistic reconstructions inconsistent with the measured data can be +generated, hallucinating image features that are uniquely present in the +training dataset. To simultaneously enforce data-consistency and leverage +data-driven priors, we introduce a novel sampling framework called Steerable +Conditional Diffusion. This framework adapts the denoising network specifically +to the available measured data. Utilising our proposed method, we achieve +substantial enhancements in OOD performance across diverse imaging modalities, +advancing the robust deployment of denoising diffusion models in real-world +applications. + +
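+ The core idea, steering the sampler with the measured data, belongs to the family of data-consistency-guided diffusion sampling. A generic sketch of one guided reverse step follows (a DPS-style posterior correction; it is not the paper's exact update, which additionally adapts the denoising network itself to the measurements):
+ import torch
+
+ def guided_denoising_step(x_t, t, denoiser, A, y, step_size=1.0):
+     # Nudge the current sample so the denoised estimate agrees with the
+     # measurements y under the forward operator A, enforcing
+     # data-consistency on top of the learned prior.
+     x_t = x_t.detach().requires_grad_(True)
+     x0_hat = denoiser(x_t, t)                       # predicted clean image
+     residual = torch.linalg.vector_norm(A(x0_hat) - y)
+     grad = torch.autograd.grad(residual, x_t)[0]
+     return (x_t - step_size * grad).detach()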
+ ☆ Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer and NearFarMix Augmentation WACV 2024
+ In computer vision, depth estimation is crucial for domains like robotics, autonomous vehicles, augmented reality, and virtual reality. Integrating semantics with depth enhances scene understanding through reciprocal information sharing. However, the scarcity of semantic information in datasets poses challenges. Existing convolutional approaches with limited local receptive fields hinder the full utilization of the symbiotic potential between depth and semantics. This paper introduces a dataset-invariant semi-supervised strategy to address the scarcity of semantic information. It proposes the Depth Semantics Symbiosis module, leveraging the Symbiotic Transformer for achieving comprehensive mutual awareness through information exchange within both local and global contexts. Additionally, a novel augmentation, NearFarMix, is introduced to combat overfitting and to benefit both the depth and semantic tasks by strategically merging regions from two images, generating diverse and structurally consistent samples with enhanced control. Extensive experiments on the NYU-Depth-V2 and KITTI datasets demonstrate the superiority of our proposed techniques in indoor and outdoor environments.
+ comment: Accepted at WACV 2024
+ ☆ Ensemble of Anchor-Free Models for Robust Bangla Document Layout Segmentation
+ In this research paper, we present an innovative system designed for the +purpose of segmenting the layout of Bangla documents. Our methodology involves +utilizing a sophisticated collection of YOLOv8 models, meticulously adapted for +the DL Sprint 2.0 - BUET CSE Fest 2023 Competition that centers around Bangla +document layout segmentation. Our primary focus lies in elevating various +elements of the task, including techniques like image augmentation, model +architecture, and the use of model ensembles. We intentionally lower the +quality of a subset of document images to enhance the resilience of model +training, consequently leading to an improvement in our cross-validation score. +Employing Bayesian optimization, we determine the optimal confidence and IoU +thresholds for our model ensemble. Through our approach, we successfully +showcase the effectiveness of amalgamating anchor-free models to achieve robust +layout segmentation in Bangla documents. + +
+ comment: 4 pages, 5 figures, 6 tables
+ ☆ UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for Temporal Forgery Localization ACM MM 2023
+ The emergence of artificial intelligence-generated content (AIGC) has raised +concerns about the authenticity of multimedia content in various fields. +However, existing research for forgery content detection has focused mainly on +binary classification tasks of complete videos, which has limited applicability +in industrial settings. To address this gap, we propose UMMAFormer, a novel +universal transformer framework for temporal forgery localization (TFL) that +predicts forgery segments with multimodal adaptation. Our approach introduces a +Temporal Feature Abnormal Attention (TFAA) module based on temporal feature +reconstruction to enhance the detection of temporal differences. We also design +a Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the +Feature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the +proposed method, we contribute a novel Temporal Video Inpainting Localization +(TVIL) dataset specifically tailored for video inpainting scenes. Our +experiments show that our approach achieves state-of-the-art performance on +benchmark datasets, including Lav-DF, TVIL, and Psynd, significantly +outperforming previous methods. The code and data are available at +https://github.com/ymhzyj/UMMAFormer/. + +
+ comment: 11 pages, 8 figures, 66 references. This paper has been accepted for ACM MM 2023
+ ☆ 1st Place Solution for the 5th LSVOS Challenge: Video Instance Segmentation
+ Video instance segmentation is a challenging task that serves as the +cornerstone of numerous downstream applications, including video editing and +autonomous driving. In this report, we present further improvements to the SOTA +VIS method, DVIS. First, we introduce a denoising training strategy for the +trainable tracker, allowing it to achieve more stable and accurate object +tracking in complex and long videos. Additionally, we explore the role of +visual foundation models in video instance segmentation. By utilizing a frozen +VIT-L model pre-trained by DINO v2, DVIS demonstrates remarkable performance +improvements. With these enhancements, our method achieves 57.9 AP and 56.0 AP +in the development and test phases, respectively, and ultimately ranked 1st in +the VIS track of the 5th LSVOS Challenge. The code will be available at +https://github.com/zhang-tao-whu/DVIS. + +
+ ☆ FIRE: Food Image to REcipe generation
+ Food computing has emerged as a prominent multidisciplinary field of research +in recent years. An ambitious goal of food computing is to develop end-to-end +intelligent systems capable of autonomously producing recipe information for a +food image. Current image-to-recipe methods are retrieval-based and their +success depends heavily on the dataset size and diversity, as well as the +quality of learned embeddings. Meanwhile, the emergence of powerful +attention-based vision and language models presents a promising avenue for +accurate and generalizable recipe generation, which has yet to be extensively +explored. This paper proposes FIRE, a novel multimodal methodology tailored to +recipe generation in the food computing domain, which generates the food title, +ingredients, and cooking instructions based on input food images. FIRE +leverages the BLIP model to generate titles, utilizes a Vision Transformer with +a decoder for ingredient extraction, and employs the T5 model to generate +recipes incorporating titles and ingredients as inputs. We showcase two +practical applications that can benefit from integrating FIRE with large +language model prompting: recipe customization to fit recipes to user +preferences and recipe-to-code transformation to enable automated cooking +processes. Our experimental findings validate the efficacy of our proposed +approach, underscoring its potential for future advancements and widespread +adoption in food computing. + +
+ comment: 5 figures, 4 tables
+ ☆ Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a Light-Weight ToF Sensor ICCV 2023
+ Light-weight time-of-flight (ToF) depth sensors are compact and +cost-efficient, and thus widely used on mobile devices for tasks such as +autofocus and obstacle detection. However, due to the sparse and noisy depth +measurements, these sensors have rarely been considered for dense geometry +reconstruction. In this work, we present the first dense SLAM system with a +monocular camera and a light-weight ToF sensor. Specifically, we propose a +multi-modal implicit scene representation that supports rendering both the +signals from the RGB camera and light-weight ToF sensor which drives the +optimization by comparing with the raw sensor inputs. Moreover, in order to +guarantee successful pose tracking and reconstruction, we exploit a predicted +depth as an intermediate supervision and develop a coarse-to-fine optimization +strategy for efficient learning of the implicit representation. At last, the +temporal information is explicitly exploited to deal with the noisy signals +from light-weight ToF sensors to improve the accuracy and robustness of the +system. Experiments demonstrate that our system well exploits the signals of +light-weight ToF sensors and achieves competitive results both on camera +tracking and dense scene reconstruction. Project page: +\url{https://zju3dv.github.io/tof_slam/}. + +
+ comment: Accepted to ICCV 2023 (Oral). Project Page: https://zju3dv.github.io/tof_slam/
+ ☆ GKGNet: Group K-Nearest Neighbor based Graph Convolutional Network for Multi-Label Image Recognition
+ Multi-Label Image Recognition (MLIR) is a challenging task that aims to predict multiple object labels in a single image while modeling the complex relationships between labels and image regions. Although convolutional neural networks and vision transformers have succeeded in processing images as regular grids of pixels or patches, these representations are sub-optimal for capturing irregular and discontinuous regions of interest. In this work, we present the first fully graph convolutional model, the Group K-nearest neighbor based Graph convolutional Network (GKGNet), which models the connections between semantic label embeddings and image patches in a flexible and unified graph structure. To address the scale variance of different objects and to capture information from multiple perspectives, we propose the Group KGCN module for dynamic graph construction and message passing. Our experiments demonstrate that GKGNet achieves state-of-the-art performance with significantly lower computational costs on the challenging multi-label datasets, i.e., MS-COCO and VOC2007. We will release the code and models to facilitate future research in this area.
+ ☆ SuperUDF: Self-supervised UDF Estimation for Surface Reconstruction
+ Learning-based surface reconstruction based on unsigned distance functions (UDF) has many advantages, such as handling open surfaces. We propose SuperUDF, a self-supervised UDF learning method that exploits a learned geometry prior for efficient training and a novel regularization for robustness to sparse sampling. The core idea of SuperUDF draws inspiration from the classical surface approximation operator of locally optimal projection (LOP). The key insight is that if the UDF is estimated correctly, the 3D points should be locally projected onto the underlying surface following the gradient of the UDF. Based on that, a number of inductive biases on UDF geometry and a pre-learned geometry prior are devised to learn UDF estimation efficiently. A novel regularization loss is proposed to make SuperUDF robust to sparse sampling. Furthermore, we also contribute a learning-based mesh extraction from the estimated UDFs. Extensive evaluations demonstrate that SuperUDF outperforms the state of the art on several public datasets in terms of both quality and efficiency. Code will be released after acceptance.
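+ The LOP-style insight above can be written directly as a differentiable check; a minimal sketch, assuming `udf` is a network mapping (N, 3) points to (N,) unsigned distances:
+ import torch
+
+ def project_to_surface(points, udf):
+     # If the UDF is correct, moving each point along the negative UDF
+     # gradient by its UDF value should land it on the underlying surface.
+     points = points.detach().requires_grad_(True)
+     d = udf(points)                                   # (N,) distances
+     (grad,) = torch.autograd.grad(d.sum(), points)
+     direction = torch.nn.functional.normalize(grad, dim=-1)
+     return points - d.unsqueeze(-1) * direction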
+ ☆ Improving Lesion Volume Measurements on Digital Mammograms
+ Lesion volume is an important predictor for prognosis in breast cancer. We +make a step towards a more accurate lesion volume measurement on digital +mammograms by developing a model that allows to estimate lesion volumes on +processed mammograms, which are the images routinely used by radiologists in +clinical practice as well as in breast cancer screening and are available in +medical centers. Processed mammograms are obtained from raw mammograms, which +are the X-ray data coming directly from the scanner, by applying certain +vendor-specific non-linear transformations. At the core of our volume +estimation method is a physics-based algorithm for measuring lesion volumes on +raw mammograms. We subsequently extend this algorithm to processed mammograms +via a deep learning image-to-image translation model that produces synthetic +raw mammograms from processed mammograms in a multi-vendor setting. We assess +the reliability and validity of our method using a dataset of 1778 mammograms +with an annotated mass. Firstly, we investigate the correlations between lesion +volumes computed from mediolateral oblique and craniocaudal views, with a +resulting Pearson correlation of 0.93 [95% confidence interval (CI) 0.92 - +0.93]. Secondly, we compare the resulting lesion volumes from true and +synthetic raw data, with a resulting Pearson correlation of 0.998 [95% CI 0.998 +- 0.998] . Finally, for a subset of 100 mammograms with a malign mass and +concurrent MRI examination available, we analyze the agreement between lesion +volume on mammography and MRI, resulting in an intraclass correlation +coefficient of 0.81 [95% CI 0.73 - 0.87] for consistency and 0.78 [95% CI 0.66 +- 0.86] for absolute agreement. In conclusion, we developed an algorithm to +measure mammographic lesion volume that reached excellent reliability and good +validity, when using MRI as ground truth. + +
+ ☆ MetaWeather: Few-Shot Weather-Degraded Image Restoration via Degradation Pattern Matching
+ Real-world vision tasks frequently suffer from the appearance of adverse +weather conditions including rain, fog, snow, and raindrops in captured images. +Recently, several generic methods for restoring weather-degraded images have +been proposed, aiming to remove multiple types of adverse weather effects +present in the images. However, these methods have considered weather as +discrete and mutually exclusive variables, leading to failure in generalizing +to unforeseen weather conditions beyond the scope of the training data, such as +the co-occurrence of rain, fog, and raindrops. To this end, weather-degraded +image restoration models should have flexible adaptability to the current +unknown weather condition to ensure reliable and optimal performance. The +adaptation method should also be able to cope with data scarcity for real-world +adaptation. This paper proposes MetaWeather, a few-shot weather-degraded image +restoration method for arbitrary weather conditions. For this, we devise the +core piece of MetaWeather, coined Degradation Pattern Matching Module (DPMM), +which leverages representations from a few-shot support set by matching +features between input and sample images under new weather conditions. In +addition, we build meta-knowledge with episodic meta-learning on top of our +MetaWeather architecture to provide flexible adaptability. In the meta-testing +phase, we adopt a parameter-efficient fine-tuning method to preserve the +prebuilt knowledge and avoid the overfitting problem. Experiments on the BID +Task II.A dataset show our method achieves the best performance on PSNR and +SSIM compared to state-of-the-art image restoration methods. Code is available +at (TBA). + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Attention-Guided Lidar Segmentation and Odometry Using Image-to-Point + Cloud Saliency Transfer + + +
+ LiDAR odometry estimation and 3D semantic segmentation are crucial for
+autonomous driving and have achieved remarkable advances recently. However,
+these tasks are challenging due to the imbalance of points across semantic
+categories in 3D semantic segmentation and the influence of dynamic objects in
+LiDAR odometry estimation, which increases the importance of using
+representative/salient landmarks as reference points for robust feature
+learning. To address these challenges, we propose a saliency-guided approach
+that leverages attention information to improve the performance of LiDAR
+odometry estimation and semantic segmentation models. Unlike in the image
+domain, only a few studies have addressed point cloud saliency information due
+to the lack of annotated training data. To alleviate this, we first present a
+universal framework to transfer saliency distribution knowledge from color
+images to point clouds, and use this to construct a pseudo-saliency dataset
+(i.e., FordSaliency) for point clouds. Then, we adopt point cloud-based
+backbones to learn saliency distribution from pseudo-saliency labels, which is
+followed by our proposed SalLiDAR module. SalLiDAR is a saliency-guided 3D
+semantic segmentation model that integrates saliency information to improve
+segmentation performance. Finally, we introduce SalLONet, a self-supervised
+saliency-guided LiDAR odometry network that uses the semantic and saliency
+predictions of SalLiDAR to achieve better odometry estimation. Our extensive
+experiments on benchmark datasets demonstrate that the proposed SalLiDAR and
+SalLONet models achieve state-of-the-art performance compared to existing
+methods, highlighting the effectiveness of image-to-LiDAR saliency knowledge
+transfer. Source code will be available at https://github.com/nevrez/SalLONet.
+
+&#13;
+
+ comment: 33 pages, 12 Figures, 6 Tables +
+
+
+
+
+ + ☆ CPFES: Physical Fitness Evaluation Based on Canadian Agility and + Movement Skill Assessment + + +
+ In recent years, the assessment of fundamental movement skills integrated
+with physical education has focused on both teaching practice and the
+feasibility of assessment. The object of assessment has shifted from multiple
+ages to subdivided ages, while the content of assessment has changed from
+complex and time-consuming to concise and efficient. Therefore, we apply deep
+learning to physical fitness evaluation and propose the Canadian Agility and
+Movement Skill Assessment (CAMSA) Physical Fitness Evaluation System (CPFES),
+which evaluates children's physical fitness based on CAMSA and gives
+recommendations based on the scores obtained by CPFES to help children grow. We
+have designed a landmark detection module, a pose estimation module, and a pose
+evaluation module for the CAMSA criteria that can effectively evaluate the
+actions of the child being tested. Our experimental results demonstrate the
+high accuracy of the proposed system.
+
+&#13;
+
+
+
+
+
+ ☆ Machine Unlearning Methodology based on Stochastic Teacher Network
+
+
+&#13;
+ The rise of the phenomenon of the "right to be forgotten" has prompted
+research on machine unlearning, which grants data owners the right to actively
+withdraw data that has been used for model training, and requires the
+elimination of the contribution of that data to the model. A simple method to
+achieve this is to use the remaining data to retrain the model, but this is not
+acceptable for other data owners who continue to participate in training.
+Existing machine unlearning methods have been found to be ineffective in
+quickly removing knowledge from deep learning models. This paper proposes using
+a stochastic network as a teacher to quickly mitigate the influence of the
+forgotten data on the model. We performed experiments on three datasets, and
+the findings demonstrate that our approach can efficiently mitigate the
+influence of target data on the model within a single epoch. This allows for
+one-time erasure and reconstruction of the model, and the reconstructed model
+achieves the same performance as the retrained model.
+
+&#13;
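+ A minimal sketch of the stochastic-teacher idea as we read it: distill the
+model toward a randomly initialized (never trained) teacher on the forget set
+so its predictions there become uninformative. The loop, KL loss choice, and
+all names are illustrative assumptions, not the paper's code.
+
+import torch
+import torch.nn.functional as F
+
+def unlearn_epoch(model, stochastic_teacher, forget_loader, optimizer):
+    model.train()
+    stochastic_teacher.eval()  # random weights, kept frozen
+    for x, _ in forget_loader:
+        with torch.no_grad():
+            t_logits = stochastic_teacher(x)
+        s_logits = model(x)
+        # push forget-set predictions toward the random teacher's outputs
+        loss = F.kl_div(F.log_softmax(s_logits, dim=1),
+                        F.softmax(t_logits, dim=1), reduction="batchmean")
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()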
+
+ comment: Accepted by 19th International Conference on Advanced Data Mining and + Applications. (ADMA 2023) +
+
+
+
+
+ + ☆ UniPT: Universal Parallel Tuning for Transfer Learning with Efficient + Parameter and Memory + + +
+ Fine-tuning pre-trained models has emerged as a powerful technique in
+numerous domains, owing to its ability to leverage enormous pre-existing
+knowledge and achieve remarkable performance on downstream tasks. However,
+updating the parameters of entire networks is computationally intensive.
+Although state-of-the-art parameter-efficient transfer learning (PETL) methods
+significantly reduce the trainable parameters and storage demand, almost all of
+them still need to back-propagate the gradients through large pre-trained
+networks. This memory-intensive characteristic severely limits the
+applicability of PETL methods in real-world scenarios. To this end, we propose
+a new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT).
+Specifically, we facilitate the transfer process via a lightweight learnable
+parallel network, which consists of two modules: 1) A parallel interaction
+module that decouples the inherently sequential connections of the pre-trained
+network and processes its intermediate activations in a detached manner. 2) A
+confidence aggregation module that adaptively learns optimal strategies for
+integrating cross-layer features. We evaluate UniPT with different backbones
+(e.g., VSE$\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging
+vision-and-language tasks (i.e., image-text retrieval, video-text retrieval,
+visual question answering, compositional question answering, and visual
+grounding). Extensive ablations on ten datasets have validated that our UniPT
+can not only dramatically reduce memory consumption and outperform the best
+memory-efficient competitor, but also achieve higher performance than existing
+PETL methods in a low-memory scenario on different architectures. Our code is
+publicly available at: https://github.com/Paranioar/UniPT.
+
+&#13;
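+ To make the memory argument concrete, here is a hedged sketch of a
+lightweight parallel head in the spirit of UniPT: the backbone's intermediate
+activations are detached, so gradients only flow through the small head. The
+module shapes, names, and softmax-based confidence aggregation are our
+assumptions, not the paper's exact design.
+
+import torch
+import torch.nn as nn
+
+class ParallelHead(nn.Module):
+    def __init__(self, dims, hidden, out_dim):
+        super().__init__()
+        self.proj = nn.ModuleList([nn.Linear(d, hidden) for d in dims])
+        self.conf = nn.Linear(hidden, 1)   # per-layer confidence score
+        self.out = nn.Linear(hidden, out_dim)
+
+    def forward(self, feats):              # feats: list of detached [B, d_i]
+        h = torch.stack([p(f) for p, f in zip(self.proj, feats)], dim=1)
+        w = torch.softmax(self.conf(h), dim=1)   # confidence aggregation
+        return self.out((w * h).sum(dim=1))
+
+# usage: run the frozen backbone under no_grad and feed detached activations,
+# e.g. logits = head([f.detach() for f in backbone_features(x)])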
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Local-Global Pseudo-label Correction for Source-free Domain Adaptive + Medical Image Segmentation + + +
+ Domain shift is a commonly encountered issue in medical imaging solutions,
+primarily caused by variations in imaging devices and data sources. To mitigate
+this problem, unsupervised domain adaptation techniques have been employed.
+However, concerns regarding patient privacy and potential degradation of image
+quality have led to an increased focus on source-free domain adaptation. In
+this study, we address the issue of false labels in self-training based
+source-free domain adaptive medical image segmentation methods. To correct
+erroneous pseudo-labels, we propose a novel approach called the local-global
+pseudo-label correction (LGDA) method for source-free domain adaptive medical
+image segmentation. Our method consists of two components: an offline local
+context-based pseudo-label correction method that utilizes local context
+similarity in image space, and an online global pseudo-label correction method
+based on class prototypes, which corrects erroneously predicted pseudo-labels
+by considering the relative distance between pixel-wise feature vectors and
+prototype vectors. We evaluate the performance of our method on three benchmark
+fundus image datasets for optic disc and cup segmentation. Our method achieves
+superior performance compared to the state-of-the-art approaches, even without
+using any source data.
+
+&#13;
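+ The online, prototype-based correction step lends itself to a compact sketch:
+class prototypes are the mean feature of pixels currently assigned to each
+class, and a pixel's pseudo-label is replaced by that of its nearest prototype.
+Tensor shapes and the Euclidean distance are our assumptions.
+
+import torch
+
+def correct_pseudo_labels(feats, pseudo, num_classes):
+    # feats: [N, C] pixel-wise features; pseudo: [N] current pseudo-labels
+    protos = torch.stack([
+        feats[pseudo == k].mean(dim=0) if (pseudo == k).any()
+        else feats.new_zeros(feats.size(1))
+        for k in range(num_classes)
+    ])                                     # [K, C] class prototypes
+    dists = torch.cdist(feats, protos)     # [N, K] feature-prototype distances
+    return dists.argmin(dim=1)             # corrected pseudo-labels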
+
+ comment: 30 pages,7 figures +
+
+
+
+
+ + ☆ Direct initial orbit determination + + +
+ Initial orbit determination (IOD) is an important early step in the
+processing chain that makes sense of and reconciles the multiple optical
+observations of a resident space object. IOD methods generally operate on
+line-of-sight (LOS) vectors extracted from images of the object, hence the LOS
+vectors can be seen as discrete point samples of the raw optical measurements.
+Typically, the number of LOS vectors used by an IOD method is much smaller than
+the available measurements (i.e., the set of pixel intensity values), hence
+current IOD methods arguably under-utilize the rich information present in the
+data. In this paper, we propose a direct IOD method called D-IOD that fits the
+orbital parameters directly on the observed streak images, without requiring
+LOS extraction. Since it does not utilize LOS vectors, D-IOD avoids potential
+inaccuracies or errors due to an imperfect LOS extraction step. Two innovations
+underpin our novel orbit-fitting paradigm: first, we introduce a novel
+non-linear least-squares objective function that computes the loss between the
+candidate-orbit-generated streak images and the observed streak images. Second,
+the objective function is minimized with a gradient descent approach that is
+embedded in our proposed optimization strategies designed for streak images. We
+demonstrate the effectiveness of D-IOD on a variety of simulated scenarios and
+challenging real streak images.
+
+&#13;
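+ A skeleton of the direct fitting idea, under the assumption of a
+differentiable streak renderer: synthesize a streak image from candidate
+orbital elements and minimize the pixel residual by gradient descent. The
+renderer, optimizer choice, and names are illustrative, not the paper's code.
+
+import torch
+
+def fit_orbit(observed, render_streak, init_elements, steps=500, lr=1e-2):
+    # render_streak: differentiable map from orbital elements to an image
+    elements = init_elements.clone().requires_grad_(True)
+    opt = torch.optim.Adam([elements], lr=lr)
+    for _ in range(steps):
+        pred = render_streak(elements)           # candidate streak image
+        loss = ((pred - observed) ** 2).mean()   # non-linear least squares
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    return elements.detach()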
+
+ comment: 28 pages, 17 figures, Submitted to Advances in Space Research +
+
+
+
+
+ + ☆ Bridging Cross-task Protocol Inconsistency for Distillation in Dense + Object Detection ICCV2023 + + +
+ Knowledge distillation (KD) has shown potential for learning compact models
+in dense object detection. However, the commonly used softmax-based
+distillation ignores the absolute classification scores for individual
+categories. Thus, the optimum of the distillation loss does not necessarily
+lead to the optimal student classification scores for dense object detectors.
+This cross-task protocol inconsistency is critical, especially for dense object
+detectors, since the foreground categories are extremely imbalanced. To address
+the issue of protocol differences between distillation and classification, we
+propose a novel distillation method with cross-task consistent protocols,
+tailored for dense object detection. For classification distillation, we
+address the cross-task protocol inconsistency problem by formulating the
+classification logit maps in both teacher and student models as multiple
+binary-classification maps and applying a binary-classification distillation
+loss to each map. For localization distillation, we design an IoU-based
+Localization Distillation Loss that is free from specific network structures
+and can be compared with existing localization distillation losses. Our
+proposed method is simple but effective, and experimental results demonstrate
+its superiority over existing methods. Code is available at
+https://github.com/TinyTigerPan/BCKD.
+
+&#13;
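+ The classification part of the recipe reduces to a few lines: treat the logit
+maps as one binary-classification map per category and distill each with a
+binary cross-entropy. A hedged sketch follows; the paper's exact weighting and
+temperature handling may differ.
+
+import torch
+import torch.nn.functional as F
+
+def binary_kd_loss(student_logits, teacher_logits, temperature=1.0):
+    # logits: [B, K, H, W]; one binary-classification map per category
+    p_teacher = torch.sigmoid(teacher_logits / temperature)
+    return F.binary_cross_entropy_with_logits(
+        student_logits / temperature, p_teacher, reduction="mean")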
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Unleash Model Potential: Bootstrapped Meta Self-supervised Learning NIPS + + +
+ The long-term goal of machine learning is to learn general visual
+representations from a small amount of data without supervision, mimicking
+three advantages of human cognition: i) no need for labels, ii) robustness to
+data scarcity, and iii) learning from experience. Self-supervised learning and
+meta-learning are two promising techniques to achieve this goal, but they both
+only partially capture the advantages and fail to address all the problems.
+Self-supervised learning struggles to overcome the drawbacks of data scarcity,
+while ignoring prior knowledge that can facilitate learning and generalization.
+Meta-learning relies on supervised information and suffers from a bottleneck of
+insufficient learning. To address these issues, we propose a novel Bootstrapped
+Meta Self-Supervised Learning (BMSSL) framework that aims to simulate the human
+learning process. We first analyze the close relationship between meta-learning
+and self-supervised learning. Based on this insight, we reconstruct tasks to
+leverage the strengths of both paradigms, achieving advantages i and ii.
+Moreover, we employ a bi-level optimization framework that alternates between
+solving specific tasks with a learned ability (first level) and improving this
+ability (second level), attaining advantage iii. To fully harness its power, we
+introduce a bootstrapped target based on meta-gradient to make the model its
+own teacher. We validate the effectiveness of our approach with comprehensive
+theoretical and empirical studies.
+
+&#13;
+
+ comment: submitted to NIPS +
+
+
+
+
+ + ☆ FaceChain: A Playground for Identity-Preserving Portrait Generation + + +
+ Recent advancements in personalized image generation have unveiled the
+intriguing capability of pre-trained text-to-image models to learn identity
+information from a collection of portrait images. However, existing solutions
+can struggle to produce truthful details, and usually suffer from several
+defects such as (i) the generated face exhibits its own unique characteristics,
+i.e., facial shape and facial feature positioning may not resemble key
+characteristics of the input, and (ii) the synthesized face may contain warped,
+blurred or corrupted regions. In this paper, we present FaceChain, a
+personalized portrait generation framework that combines a series of customized
+image-generation models and a rich set of face-related perceptual understanding
+models (e.g., face detection, deep face embedding extraction, and facial
+attribute recognition) to tackle the aforementioned challenges and to generate
+truthful personalized portraits, with only a handful of portrait images as
+input. Concretely, we inject several SOTA face models into the generation
+procedure, achieving more efficient label-tagging, data-processing, and model
+post-processing compared to previous solutions, such as
+DreamBooth~\cite{ruiz2023dreambooth}, InstantBooth~\cite{shi2023instantbooth},
+or other LoRA-only approaches~\cite{hu2021lora}. Through the development of
+FaceChain, we have identified several potential directions to accelerate the
+development of Face/Human-Centric AIGC research and application. We have
+designed FaceChain as a framework comprised of pluggable components that can be
+easily adjusted to accommodate different styles and personalized needs. We hope
+it can grow to serve the burgeoning needs from the communities. FaceChain is
+open-sourced under the Apache-2.0 license at
+\url{https://github.com/modelscope/facechain}.
+
+&#13;
+
+ comment: This is an ongoing work that will be consistently refined and + improved upon +
+
+
+
+
+ + ☆ HoloFusion: Towards Photo-realistic 3D Generative Modeling ICCV 2023 + + +
+ Diffusion-based image generators can now produce high-quality and diverse +samples, but their success has yet to fully translate to 3D generation: +existing diffusion methods can either generate low-resolution but 3D consistent +outputs, or detailed 2D views of 3D objects but with potential structural +defects and lacking view consistency or realism. We present HoloFusion, a +method that combines the best of these approaches to produce high-fidelity, +plausible, and diverse 3D samples while learning from a collection of +multi-view 2D images only. The method first generates coarse 3D samples using a +variant of the recently proposed HoloDiffusion generator. Then, it +independently renders and upsamples a large number of views of the coarse 3D +model, super-resolves them to add detail, and distills those into a single, +high-fidelity implicit 3D representation, which also ensures view consistency +of the final renders. The super-resolution network is trained as an integral +part of HoloFusion, end-to-end, and the final distillation uses a new sampling +scheme to capture the space of super-resolved signals. We compare our method +against existing baselines, including DreamFusion, Get3D, EG3D, and +HoloDiffusion, and achieve, to the best of our knowledge, the most realistic +results on the challenging CO3Dv2 dataset. + +
+
+ comment: ICCV 2023 conference; project page at: + https://holodiffusion.github.io/holofusion +
+
+
+
+
+ + ☆ Entropy-based Guidance of Deep Neural Networks for Accelerated + Convergence and Improved Performance + + +
+ Neural networks have dramatically increased our capacity to learn from large, +high-dimensional datasets across innumerable disciplines. However, their +decisions are not easily interpretable, their computational costs are high, and +building and training them are uncertain processes. To add structure to these +efforts, we derive new mathematical results to efficiently measure the changes +in entropy as fully-connected and convolutional neural networks process data, +and introduce entropy-based loss terms. Experiments in image compression and +image classification on benchmark datasets demonstrate these losses guide +neural networks to learn rich latent data representations in fewer dimensions, +converge in fewer training epochs, and achieve better test metrics. + +
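+ As a hedged illustration of adding such an entropy-based term to training,
+the sketch below scores hidden activations with a per-unit Gaussian
+differential-entropy estimate; the paper derives its own measures, so this
+estimator and its weighting are purely our assumptions.
+
+import math
+import torch
+
+def gaussian_entropy(acts, eps=1e-6):
+    # acts: [B, D]; differential entropy of a Gaussian unit is
+    # 0.5 * log(2 * pi * e * var), averaged here over the D units
+    var = acts.var(dim=0) + eps
+    return 0.5 * (math.log(2 * math.pi * math.e) + var.log()).mean()
+
+# usage (illustrative): total_loss = task_loss + 0.01 * gaussian_entropy(h)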
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Auto-Prompting SAM for Mobile Friendly 3D Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) has rapidly been adopted for segmenting a
+wide range of natural images. However, recent studies have indicated that SAM
+exhibits subpar performance on 3D medical image segmentation tasks. In addition
+to the domain gaps between natural and medical images, disparities in the
+spatial arrangement between 2D and 3D images, the substantial computational
+burden that demands powerful GPU servers, and the time-consuming manual prompt
+generation impede the extension of SAM to a broader spectrum of medical image
+segmentation applications. To address these challenges, in this work, we
+introduce a novel method, AutoSAM Adapter, designed specifically for 3D
+multi-organ CT-based segmentation. We employ parameter-efficient adaptation
+techniques to develop an automatic prompt learning paradigm that facilitates
+the transfer of the SAM model's capabilities to 3D medical image segmentation,
+eliminating the need for manually generated prompts. Furthermore, we
+effectively transfer the acquired knowledge of the AutoSAM Adapter to other
+lightweight models specifically tailored for 3D medical image analysis,
+achieving state-of-the-art (SOTA) performance on medical image segmentation
+tasks. Through extensive experimental evaluation, we demonstrate that the
+AutoSAM Adapter is a critical foundation for effectively leveraging the
+emerging ability of foundation models in 2D natural image segmentation for 3D
+medical image segmentation.
+
+&#13;
+
+ comment: 9 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Application of Quantum Pre-Processing Filter for Binary Image + Classification with Small Samples + + +
+ Over the past few years, there has been significant interest in Quantum
+Machine Learning (QML) among researchers, as it has the potential to transform
+the field of machine learning. Several models that exploit the properties of
+quantum mechanics have been developed for practical applications. In this
+study, we investigated the application of our previously proposed quantum
+pre-processing filter (QPF) to binary image classification. We evaluated the
+QPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits
+and alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic
+sign images). Similar to our previous multi-class classification results, the
+application of QPF improved the binary image classification accuracy of a
+neural network on MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8% to
+98.3%, and 71.2% to 76.1%, respectively, but degraded it on GTSRB from 93.5% to
+92.0%. We then applied QPF in cases using a smaller number of training and
+testing samples, i.e., 80 and 20 samples per class, respectively. In order to
+derive statistically stable results, we conducted the experiment with 100
+trials, randomly choosing different training and testing samples each time and
+averaging the results. The results showed that the application of QPF did not
+improve the image classification accuracy on MNIST and EMNIST but improved it
+on CIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively.
+Further research will be conducted as part of future work to investigate the
+potential of QPF and to assess the scalability of the proposed approach to
+larger and more complex datasets.
+
+&#13;
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Automated Conversion of Music Videos into Lyric Videos + + +
+ Musicians and fans often produce lyric videos, a form of music video that
+showcases the song's lyrics, for their favorite songs. However, making such
+videos can be challenging and time-consuming as the lyrics need to be added in
+synchrony and visual harmony with the video. Informed by prior work and close
+examination of existing lyric videos, we propose a set of design guidelines to
+help creators make such videos. Our guidelines ensure the readability of the
+lyric text while maintaining a unified focus of attention. We instantiate these
+guidelines in a fully automated pipeline that converts an input music video
+into a lyric video. We demonstrate the robustness of our pipeline by generating
+lyric videos from a diverse range of input sources. A user study shows that
+lyric videos generated by our pipeline are effective in maintaining text
+readability and unifying the focus of attention.
+
+&#13;
+
+
+
+
+ + ☆ Maturity-Aware Active Learning for Semantic Segmentation with + Hierarchically-Adaptive Sample Assessment BMVC 2023 + + +
+ Active Learning (AL) for semantic segmentation is challenging due to heavy
+class imbalance and different ways of defining "sample" (pixels, areas, etc.),
+leaving the interpretation of the data distribution ambiguous. We propose
+"Maturity-Aware Distribution Breakdown-based Active Learning" (MADBAL), an AL
+method that benefits from a hierarchical approach to define a multiview data
+distribution, which takes the different "sample" definitions into account
+jointly and is hence able to select the most impactful segmentation pixels with
+comprehensive understanding. MADBAL also features a novel uncertainty
+formulation, where supporting AL modules are included to sense the features'
+maturity, whose weighted influence continuously contributes to the uncertainty
+detection. In this way, MADBAL makes significant performance leaps even in the
+early AL stage, hence reducing the training burden significantly. It
+outperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as
+verified in our extensive experiments.
+
+&#13;
+
+ comment: Accepted to the 34th British Machine Vision Conference (BMVC 2023) +
+
+
+
+
+ + ☆ BIT: Bi-Level Temporal Modeling for Efficient Supervised Action + Segmentation + + +
+ We address the task of supervised action segmentation, which aims to
+partition a video into non-overlapping segments, each representing a different
+action. Recent works apply transformers to perform temporal modeling at the
+frame level, which suffers from high computational cost and cannot well capture
+action dependencies over long temporal horizons. To address these issues, we
+propose an efficient BI-level Temporal modeling (BIT) framework that learns
+explicit action tokens to represent action segments, performs temporal modeling
+on the frame and action levels in parallel, and yet maintains a low
+computational cost. Our model contains (i) a frame branch that uses convolution
+to learn frame-level relationships, (ii) an action branch that uses a
+transformer to learn action-level dependencies with a small set of action
+tokens and (iii) cross-attentions to allow communication between the two
+branches. We apply and extend a set-prediction objective to allow each action
+token to represent one or multiple action segments, thus avoiding the need to
+learn a large number of tokens over long videos with many segments. Thanks to
+the design of our action branch, we can also seamlessly leverage textual
+transcripts of videos (when available) to help action segmentation by using
+them to initialize the action tokens. We evaluate our model on four video
+datasets (two egocentric and two third-person) for action segmentation with and
+without transcripts, showing that BIT significantly improves the
+state-of-the-art accuracy with much lower computational cost (30 times faster)
+compared to existing transformer-based methods.
+
+&#13;
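+ A rough sketch of the bi-level design described above: a small set of learned
+action tokens attends to frame features via cross-attention, so the action
+branch stays cheap regardless of video length. Dimensions and the single
+attention layer are illustrative assumptions.
+
+import torch
+import torch.nn as nn
+
+class ActionBranch(nn.Module):
+    def __init__(self, dim=256, num_tokens=20, heads=4):
+        super().__init__()
+        self.tokens = nn.Parameter(torch.randn(num_tokens, dim))
+        self.cross = nn.MultiheadAttention(dim, heads, batch_first=True)
+
+    def forward(self, frame_feats):         # [B, T, dim] from the frame branch
+        q = self.tokens.unsqueeze(0).expand(frame_feats.size(0), -1, -1)
+        action_tokens, _ = self.cross(q, frame_feats, frame_feats)
+        return action_tokens                 # [B, num_tokens, dim]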
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ RobustCLEVR: A Benchmark and Framework for Evaluating Robustness in + Object-centric Learning + + +
+ Object-centric representation learning offers the potential to overcome
+limitations of image-level representations by explicitly parsing image scenes
+into their constituent components. While image-level representations typically
+lack robustness to natural image corruptions, the robustness of object-centric
+methods remains largely untested. To address this gap, we present the
+RobustCLEVR benchmark dataset and evaluation framework. Our framework takes a
+novel approach to evaluating robustness by enabling the specification of causal
+dependencies in the image generation process grounded in expert knowledge and
+capable of producing a wide range of image corruptions unattainable in existing
+robustness evaluations. Using our framework, we define several causal models of
+the image corruption process which explicitly encode assumptions about the
+causal relationships and distributions of each corruption type. We generate
+dataset variants for each causal model on which we evaluate state-of-the-art
+object-centric methods. Overall, we find that object-centric methods are not
+inherently robust to image corruptions. Our causal evaluation approach exposes
+model sensitivities not observed using conventional evaluation processes,
+yielding greater insight into robustness differences across algorithms. Lastly,
+while conventional robustness evaluations view corruptions as
+out-of-distribution, we use our causal framework to show that even training on
+in-distribution image corruptions does not guarantee increased model
+robustness. This work provides a step towards a more concrete and substantiated
+understanding of model performance and deterioration under complex corruption
+processes of the real world.
+
+&#13;
+
+
+
+
+ + ☆ When hard negative sampling meets supervised contrastive learning + + +
+ State-of-the-art image models predominantly follow a two-stage strategy:
+pre-training on large datasets and fine-tuning with cross-entropy loss. Many
+studies have shown that using cross-entropy can result in sub-optimal
+generalisation and stability. While the supervised contrastive loss addresses
+some limitations of cross-entropy loss by focusing on intra-class similarities
+and inter-class differences, it neglects the importance of hard negative
+mining. We propose that models benefit from weighting negative samples based on
+their dissimilarity to positive counterparts. In this paper, we introduce a new
+supervised contrastive learning objective, SCHaNe, which incorporates hard
+negative sampling during the fine-tuning phase. Without requiring specialized
+architectures, additional data, or extra computational resources, experimental
+results indicate that SCHaNe outperforms the strong baseline BEiT-3 in Top-1
+accuracy across various benchmarks, with significant gains of up to 3.32% in
+few-shot learning settings and 3.41% in full dataset fine-tuning. Importantly,
+our proposed objective sets a new state-of-the-art for base models on
+ImageNet-1k, achieving an 86.14% accuracy. Furthermore, we demonstrate that the
+proposed objective yields better embeddings and explains the improved
+effectiveness observed in our experiments.
+
+&#13;
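+ One plausible reading of "hard negative sampling" in a supervised contrastive
+objective is to up-weight negatives by their similarity to the anchor. The
+hedged sketch below implements such a weighted SupCon loss; the paper's exact
+SCHaNe formulation may differ.
+
+import torch
+import torch.nn.functional as F
+
+def weighted_supcon_loss(feats, labels, tau=0.1, beta=1.0):
+    z = F.normalize(feats, dim=1)                 # [B, D] embeddings
+    sim = (z @ z.t()) / tau                       # [B, B] scaled similarities
+    eye = torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)
+    same = labels.unsqueeze(0).eq(labels.unsqueeze(1))
+    pos, neg = same & ~eye, ~same
+    # harder (more similar) negatives get larger weights, averaging to one
+    w = torch.where(neg, (beta * sim).exp(), torch.zeros_like(sim))
+    w = w * neg.sum(1, keepdim=True) / w.sum(1, keepdim=True).clamp_min(1e-12)
+    denom = (sim.exp() * pos).sum(1) + (w * sim.exp()).sum(1)
+    log_prob = sim - denom.clamp_min(1e-12).log().unsqueeze(1)
+    return -((log_prob * pos).sum(1) / pos.sum(1).clamp_min(1)).mean()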
+
+
+
+
+ + ☆ Evaluation of Key Spatiotemporal Learners for Print Track Anomaly + Classification Using Melt Pool Image Streams + + +
+ Recent applications of machine learning in metal additive manufacturing (MAM) +have demonstrated significant potential in addressing critical barriers to the +widespread adoption of MAM technology. Recent research in this field emphasizes +the importance of utilizing melt pool signatures for real-time defect +prediction. While high-quality melt pool image data holds the promise of +enabling precise predictions, there has been limited exploration into the +utilization of cutting-edge spatiotemporal models that can harness the inherent +transient and sequential characteristics of the additive manufacturing process. +This research introduces and puts into practice some of the leading deep +spatiotemporal learning models that can be adapted for the classification of +melt pool image streams originating from various materials, systems, and +applications. Specifically, it investigates two-stream networks comprising +spatial and temporal streams, a recurrent spatial network, and a factorized 3D +convolutional neural network. The capacity of these models to generalize when +exposed to perturbations in melt pool image data is examined using data +perturbation techniques grounded in real-world process scenarios. The +implemented architectures demonstrate the ability to capture the spatiotemporal +features of melt pool image sequences. However, among these models, only the +Kinetics400 pre-trained SlowFast network, categorized as a two-stream network, +exhibits robust generalization capabilities in the presence of data +perturbations. + +
+
+ comment: This work has been accepted to IFAC for publication under a Creative + Commons Licence CC-BY-NC-ND +
+
+
+
+
+ + ☆ SynthDistill: Face Recognition with Knowledge Distillation from + Synthetic Data + + +
+ State-of-the-art face recognition networks are often computationally
+expensive and cannot be used for mobile applications. Training lightweight face
+recognition models also requires large identity-labeled datasets. Meanwhile,
+there are privacy and ethical concerns with collecting and using large face
+recognition datasets. While generating synthetic datasets for training face
+recognition models is an alternative option, it is challenging to generate
+synthetic data with sufficient intra-class variations. In addition, there is
+still a considerable gap between the performance of models trained on real and
+synthetic data. In this paper, we propose a new framework (named SynthDistill)
+to train lightweight face recognition models by distilling the knowledge of a
+pretrained teacher face recognition model using synthetic data. We use a
+pretrained face generator network to generate synthetic face images and use the
+synthesized images to learn a lightweight student network. We use synthetic
+face images without identity labels, mitigating the problem of generating
+sufficient intra-class variation in synthetic datasets. Instead, we propose a
+novel dynamic sampling strategy from the intermediate latent space of the face
+generator network to include new variations of the challenging images while
+further exploring new face images in the training batch. The results on five
+different face recognition datasets demonstrate the superiority of our
+lightweight model compared to models trained on previous synthetic datasets,
+achieving a verification accuracy of 99.52% on the LFW dataset with a
+lightweight network. The results also show that our proposed framework
+significantly reduces the gap between training with real and synthetic data.
+The source code for replicating the experiments is publicly released.
+
+&#13;
+
+ comment: Accepted in the IEEE International Joint Conference on Biometrics + (IJCB 2023) +
+
+
+
+
+ + ☆ NSF: Neural Surface Fields for Human Modeling from Monocular Depth ICCV 2023 + + +
+ Obtaining personalized 3D animatable avatars from a monocular camera has
+several real-world applications in gaming, virtual try-on, animation, and
+VR/XR. However, it is very challenging to model dynamic and fine-grained
+clothing deformations from such sparse data. Existing methods for modeling 3D
+humans from depth data have limitations in terms of computational efficiency,
+mesh coherency, and flexibility in resolution and topology. For instance,
+reconstructing shapes using implicit functions and extracting explicit meshes
+per frame is computationally expensive and cannot ensure coherent meshes across
+frames. Moreover, predicting per-vertex deformations on a pre-designed human
+template with a discrete surface lacks flexibility in resolution and topology.
+To overcome these limitations, we propose a novel method, NSF: Neural Surface
+Fields, for modeling 3D clothed humans from monocular depth. NSF defines a
+neural field solely on the base surface which models a continuous and flexible
+displacement field. NSF can be adapted to the base surface with different
+resolution and topology without retraining at inference time. Compared to
+existing approaches, our method eliminates the expensive per-frame surface
+extraction while maintaining mesh coherency, and is capable of reconstructing
+meshes with arbitrary resolution without retraining. To foster research in this
+direction, we release our code on the project page at:
+https://yuxuan-xue.com/nsf.
+
+&#13;
+
+ comment: Accepted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf
+&#13;
+
+
+
+
+ + ☆ The Interstate-24 3D Dataset: a new benchmark for 3D multi-camera + vehicle tracking + + +
+ This work presents a novel video dataset recorded from overlapping highway +traffic cameras along an urban interstate, enabling multi-camera 3D object +tracking in a traffic monitoring context. Data is released from 3 scenes +containing video from at least 16 cameras each, totaling 57 minutes in length. +877,000 3D bounding boxes and corresponding object tracklets are fully and +accurately annotated for each camera field of view and are combined into a +spatially and temporally continuous set of vehicle trajectories for each scene. +Lastly, existing algorithms are combined to benchmark a number of 3D +multi-camera tracking pipelines on the dataset, with results indicating that +the dataset is challenging due to the difficulty of matching objects traveling +at high speeds across cameras and heavy object occlusion, potentially for +hundreds of frames, during congested traffic. This work aims to enable the +development of accurate and automatic vehicle trajectory extraction algorithms, +which will play a vital role in understanding impacts of autonomous vehicle +technologies on the safety and efficiency of traffic. + +
+
+
+
+
+ + ☆ Continual Learning with Dynamic Sparse Training: Exploring Algorithms + for Effective Model Updates + + +
+ Continual learning (CL) refers to the ability of an intelligent system to
+sequentially acquire and retain knowledge from a stream of data with as little
+computational overhead as possible. To this end, regularization, replay,
+architecture, and parameter isolation approaches have been introduced in the
+literature. Parameter isolation uses a sparse network, which makes it possible
+to allocate distinct parts of the neural network to different tasks and also to
+share parameters between tasks if they are similar. Dynamic Sparse Training
+(DST) is a prominent way to find these sparse networks and isolate them for
+each task. This paper is the first empirical study investigating the effect of
+different DST components under the CL paradigm, filling a critical research gap
+and shedding light on the optimal configuration of DST for CL, if one exists.
+Therefore, we perform a comprehensive study in which we investigate various DST
+components to find the best topology per task on the well-known CIFAR100 and
+miniImageNet benchmarks in a task-incremental CL setup, since our primary focus
+is to evaluate the performance of various DST criteria rather than the process
+of mask selection. We found that, at a low sparsity level, Erdos-Renyi Kernel
+(ERK) initialization utilizes the backbone more efficiently and allows task
+increments to be learned effectively. At a high sparsity level, however,
+uniform initialization demonstrates more reliable and robust performance. In
+terms of growth strategy, performance depends on the defined initialization
+strategy and the extent of sparsity. Finally, adaptivity within DST components
+is a promising way to build better continual learners.
+
+&#13;
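+ For reference, ERK initialization allocates per-layer density in proportion
+to (fan_in + fan_out) / (fan_in * fan_out), scaled to a target global density.
+The sketch below shows the linear-layer case with illustrative shapes; real
+implementations also redistribute the budget when a layer saturates at
+density 1.0.
+
+import numpy as np
+
+def erk_densities(shapes, global_density):
+    # shapes: list of (fan_out, fan_in) weight shapes
+    params = np.array([o * i for o, i in shapes], dtype=float)
+    score = np.array([(o + i) / (o * i) for o, i in shapes])
+    # scale factor so the expected kept parameters hit the global budget
+    eps = global_density * params.sum() / (score * params).sum()
+    return np.clip(eps * score, 0.0, 1.0)
+
+print(erk_densities([(512, 784), (256, 512), (10, 256)], global_density=0.1))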
+
+
+
+
+ + ☆ CLNeRF: Continual Learning Meets NeRF ICCV 2023 + + +
+ Novel view synthesis aims to render unseen views given a set of calibrated
+images. In practical applications, the coverage, appearance or geometry of the
+scene may change over time, with new images continuously being captured.
+Efficiently incorporating such continuous change is an open challenge. Standard
+NeRF benchmarks only involve scene coverage expansion. To study other practical
+scene changes, we propose a new dataset, World Across Time (WAT), consisting of
+scenes that change in appearance and geometry over time. We also propose a
+simple yet effective method, CLNeRF, which introduces continual learning (CL)
+to Neural Radiance Fields (NeRFs). CLNeRF combines generative replay and the
+Instant Neural Graphics Primitives (NGP) architecture to effectively prevent
+catastrophic forgetting and efficiently update the model when new data arrives.
+We also add trainable appearance and geometry embeddings to NGP, allowing a
+single compact model to handle complex scene changes. Without the need to store
+historical images, CLNeRF trained sequentially over multiple scans of a
+changing scene performs on par with the upper bound model trained on all scans
+at once. Compared to other CL baselines, CLNeRF performs much better across
+standard benchmarks and WAT. The source code and the WAT dataset are available
+at https://github.com/IntelLabs/CLNeRF. Video presentation is available at:
+https://youtu.be/nLRt6OoDGq0?si=8yD6k-8MMBJInQPs
+
+&#13;
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action + Detection ICCV + + +
+ The goal of spatial-temporal action detection is to determine the time and +place where each person's action occurs in a video and classify the +corresponding action category. Most of the existing methods adopt +fully-supervised learning, which requires a large amount of training data, +making it very difficult to achieve zero-shot learning. In this paper, we +propose to utilize a pre-trained visual-language model to extract the +representative image and text features, and model the relationship between +these features through different interaction modules to obtain the interaction +feature. In addition, we use this feature to prompt each label to obtain more +appropriate text features. Finally, we calculate the similarity between the +interaction feature and the text feature for each label to determine the action +category. Our experiments on J-HMDB and UCF101-24 datasets demonstrate that the +proposed interaction module and prompting make the visual-language features +better aligned, thus achieving excellent accuracy for zero-shot spatio-temporal +action detection. The code will be available at +https://github.com/webber2933/iCLIP. + +
+
+ comment: Accepted by ICCVW 2023 (What is Next in Multimodal Foundation + Models?) +
+
+
+
+
+ + ♻ ☆ Free Lunch for Gait Recognition: A Novel Relation Descriptor + + +
+ Gait recognition seeks correct matches for query individuals based on their
+unique walking patterns. However, current methods focus solely on extracting
+individual-specific features, overlooking inter-personal relationships. In this
+paper, we propose a novel Relation Descriptor that captures not only individual
+features but also relations between test gaits and pre-selected anchored gaits.
+Specifically, we reinterpret classifier weights as anchored gaits and compute
+similarity scores between test features and these anchors, which re-expresses
+individual gait features as a similarity relation distribution. In essence, the
+relation descriptor offers a holistic perspective that leverages the collective
+knowledge stored within the classifier's weights, emphasizing meaningful
+patterns and enhancing robustness. Despite its potential, the relation
+descriptor poses dimensionality challenges since its dimension depends on the
+training set's identity count. To address this, we propose the Farthest
+Anchored-gait Selection to identify the most discriminative anchored gaits and
+an Orthogonal Regularization to increase diversity within anchored gaits.
+Compared to individual-specific features extracted from the backbone, our
+relation descriptor can boost performance at almost no extra cost. We evaluate
+the effectiveness of our method on the popular GREW, Gait3D, CASIA-B, and
+OU-MVLP datasets, showing that our method consistently outperforms the
+baselines and achieves state-of-the-art performance.
+
+&#13;
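+ The descriptor itself is compact enough to sketch: the rows of the trained
+classifier weight matrix act as anchored gaits, and a test feature is
+re-expressed as its similarity distribution over them. The cosine similarity
+and softmax temperature here are our assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def relation_descriptor(feat, classifier_weight, tau=16.0):
+    # feat: [D] test feature; classifier_weight: [num_ids, D] trained weights
+    sims = F.normalize(feat, dim=0) @ F.normalize(classifier_weight, dim=1).t()
+    return F.softmax(tau * sims, dim=0)   # [num_ids] relation distribution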
+
+ comment: Add new figures and fix some typos +
+
+
+
+
+ + ♻ ☆ Cross-domain Federated Object Detection ICME 2023 + + +
+ Detection models trained by one party (including server) may face severe +performance degradation when distributed to other users (clients). Federated +learning can enable multi-party collaborative learning without leaking client +data. In this paper, we focus on a special cross-domain scenario in which the +server has large-scale labeled data and multiple clients only have a small +amount of labeled data; meanwhile, there exist differences in data +distributions among the clients. In this case, traditional federated learning +methods can't help a client learn both the global knowledge of all participants +and its own unique knowledge. To make up for this limitation, we propose a +cross-domain federated object detection framework, named FedOD. The proposed +framework first performs the federated training to obtain a public global +aggregated model through multi-teacher distillation, and sends the aggregated +model back to each client for fine-tuning its personalized local model. After a +few rounds of communication, on each client we can perform weighted ensemble +inference on the public global model and the personalized local model. We +establish a federated object detection dataset which has significant background +differences and instance differences based on multiple public autonomous +driving datasets, and then conduct extensive experiments on the dataset. The +experimental results validate the effectiveness of the proposed method. + +
+
+ comment: ICME 2023 +
+
+
+
+
+ + ♻ ☆ Domain Generalization with Correlated Style Uncertainty WACV2024 + + +
+ Domain generalization (DG) approaches intend to extract domain invariant
+features that can lead to a more robust deep learning model. In this regard,
+style augmentation is a strong DG method that takes advantage of
+instance-specific feature statistics, which contain informative style
+characteristics, to synthesize novel domains. While it is one of the
+state-of-the-art methods, prior works on style augmentation have either
+disregarded the interdependence amongst distinct feature channels or have
+solely constrained style augmentation to linear interpolation. To address these
+research gaps, in this work, we introduce a novel augmentation approach, named
+Correlated Style Uncertainty (CSU), surpassing the limitations of linear
+interpolation in style statistic space and simultaneously preserving vital
+correlation information. Our method's efficacy is established through extensive
+experimentation on diverse cross-domain computer vision and medical imaging
+classification tasks: the PACS, Office-Home, and Camelyon17 datasets, and the
+Duke-Market1501 instance retrieval task. The results showcase a remarkable
+improvement margin over existing state-of-the-art techniques. The source code
+is available at https://github.com/freshman97/CSU.
+
+&#13;
+
+ comment: Accepted by WACV2024, camera ready version +
+
+
+
+
+ + ♻ ☆ SoGAR: Self-supervised Spatiotemporal Attention-based Social Group + Activity Recognition + + +
+ This paper introduces a novel approach to Social Group Activity Recognition
+(SoGAR) using a self-supervised transformer network that can effectively
+utilize unlabeled video data. To extract spatio-temporal information, we
+created local and global views with varying frame rates. Our self-supervised
+objective ensures that features extracted from contrasting views of the same
+video are consistent across spatio-temporal domains. Our proposed approach
+efficiently uses transformer-based encoders to alleviate the weakly supervised
+setting of group activity recognition. By leveraging the benefits of
+transformer models, our approach can model long-term relationships along
+spatio-temporal dimensions. Our proposed SoGAR method achieved state-of-the-art
+results on three group activity recognition benchmarks, namely the JRDB-PAR,
+NBA, and Volleyball datasets, surpassing the current state of the art in terms
+of F1-score, MCA, and MPCA metrics.
+
+&#13;
+
+ comment: Under review for PR journal; 32 pages, 7 figures. arXiv admin note: + text overlap with arXiv:2303.12149 +
+
+
+
+
+ + ♻ ☆ SPARTAN: Self-supervised Spatiotemporal Transformers Approach to Group + Activity Recognition CVPR + + +
+ In this paper, we propose a new, simple, and effective Self-supervised +Spatio-temporal Transformers (SPARTAN) approach to Group Activity Recognition +(GAR) using unlabeled video data. Given a video, we create local and global +Spatio-temporal views with varying spatial patch sizes and frame rates. The +proposed self-supervised objective aims to match the features of these +contrasting views representing the same video to be consistent with the +variations in spatiotemporal domains. To the best of our knowledge, the +proposed mechanism is one of the first works to alleviate the weakly supervised +setting of GAR using the encoders in video transformers. Furthermore, using the +advantage of transformer models, our proposed approach supports long-term +relationship modeling along spatio-temporal dimensions. The proposed SPARTAN +approach performs well on two group activity recognition benchmarks, including +NBA and Volleyball datasets, by surpassing the state-of-the-art results by a +significant margin in terms of MCA and MPCA metrics. + +
+
+ comment: Accepted to CVPRW 2023; 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Three-stage binarization of color document images based on discrete + wavelet transform and generative adversarial networks + + +
+ The efficient segmentation of foreground text information from the background +in degraded color document images is a critical challenge in the preservation +of ancient manuscripts. The imperfect preservation of ancient manuscripts over +time has led to various types of degradation, such as staining, yellowing, and +ink seepage, significantly affecting image binarization results. This work +proposes a three-stage method using Generative Adversarial Networks (GAN) for +enhancing and binarizing degraded color document images through Discrete +Wavelet Transform (DWT). Stage-1 involves applying DWT and retaining the +Low-Low (LL) subband images for image enhancement. In Stage-2, the original +input image is divided into four single-channel images (Red, Green, Blue, and +Gray), and each is trained with independent adversarial networks to extract +color foreground information. In Stage-3, the output image from Stage-2 and the +original input image are used to train independent adversarial networks for +document binarization, enabling the integration of global and local features. +The experimental results demonstrate that our proposed method outperforms other +classic and state-of-the-art (SOTA) methods on the Document Image Binarization +Contest (DIBCO) datasets. We have released our implementation code at +https://github.com/abcpp12383/ThreeStageBinarization. + +
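+ Stage-1 in miniature: a 2D discrete wavelet transform splits the image into
+subbands and only the Low-Low (LL) subband is retained for enhancement. The
+sketch below uses PyWavelets with a Haar wavelet as an illustrative choice; the
+paper's wavelet may differ.
+
+import numpy as np
+import pywt
+
+img = np.random.rand(256, 256)             # stand-in grayscale document image
+LL, (LH, HL, HH) = pywt.dwt2(img, "haar")  # keep LL, discard high-frequency
+print(LL.shape)                            # (128, 128): half resolution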
+
+
+
+
+ + ♻ ☆ PiClick: Picking the desired mask in click-based interactive + segmentation + + +
+ Click-based interactive segmentation aims to generate target masks via human
+clicking, which facilitates efficient pixel-level annotation and image editing.
+In such a task, target ambiguity remains a problem hindering the accuracy and
+efficiency of segmentation. That is, in scenes with rich context, one click may
+correspond to multiple potential targets, while most previous interactive
+segmentors only generate a single mask and fail to deal with target ambiguity.
+In this paper, we propose a novel interactive segmentation network named
+PiClick, to yield all potentially reasonable masks and suggest the most
+plausible one for the user. Specifically, PiClick utilizes a Transformer-based
+architecture to generate all potential target masks by mutually interactive
+mask queries. Moreover, a Target Reasoning module is designed in PiClick to
+automatically suggest the user-desired mask from all candidates, relieving
+target ambiguity and saving extra human effort. Extensive experiments on 9
+interactive segmentation datasets demonstrate that PiClick performs favorably
+against previous state-of-the-art methods in terms of segmentation results.
+Moreover, we show that PiClick effectively reduces human effort in annotating
+and picking the desired masks. To ease the usage and inspire future research,
+we release the source code of PiClick together with a plug-and-play annotation
+tool at https://github.com/cilinyan/PiClick.
+
+&#13;
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Secure & Private Federated Neuroimaging + + +
+ The amount of biomedical data continues to grow rapidly. However, collecting +data from multiple sites for joint analysis remains challenging due to +security, privacy, and regulatory concerns. To overcome this challenge, we use +Federated Learning, which enables distributed training of neural network models +over multiple data sources without sharing data. Each site trains the neural +network over its private data for some time, then shares the neural network +parameters (i.e., weights, gradients) with a Federation Controller, which in +turn aggregates the local models, sends the resulting community model back to +each site, and the process repeats. Our Federated Learning architecture, +MetisFL, provides strong security and privacy. First, sample data never leaves +a site. Second, neural network parameters are encrypted before transmission and +the global neural model is computed under fully-homomorphic encryption. +Finally, we use information-theoretic methods to limit information leakage from +the neural model to prevent a curious site from performing model inversion or +membership attacks. We present a thorough evaluation of the performance of +secure, private federated learning in neuroimaging tasks, including for +predicting Alzheimer's disease and estimating BrainAGE from magnetic resonance +imaging (MRI) studies, in challenging, heterogeneous federated environments +where sites have different amounts of data and statistical distributions. + +
+
+ comment: 18 pages, 13 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ EgoLoc: Revisiting 3D Object Localization from Egocentric Videos with + Visual Queries ICCV 2023 + + +
+ With the recent advances in video and 3D understanding, novel 4D
+spatio-temporal methods fusing both concepts have emerged. Towards this
+direction, the Ego4D Episodic Memory Benchmark proposed a task for Visual
+Queries with 3D Localization (VQ3D). Given an egocentric video clip and an
+image crop depicting a query object, the goal is to localize the 3D position of
+the center of that query object with respect to the camera pose of a query
+frame. Current methods tackle the problem of VQ3D by unprojecting the 2D
+localization results of the sibling task Visual Queries with 2D Localization
+(VQ2D) into 3D predictions. Yet, we point out that the low number of camera
+poses caused by camera re-localization in previous VQ3D methods severely
+hinders their overall success rate. In this work, we formalize a pipeline (we
+dub EgoLoc) that better entangles 3D multiview geometry with 2D object
+retrieval from egocentric videos. Our approach involves estimating more robust
+camera poses and aggregating multi-view 3D displacements by leveraging the 2D
+detection confidence, which enhances the success rate of object queries and
+leads to a significant improvement in the VQ3D baseline performance.
+Specifically, our approach achieves an overall success rate of up to 87.12%,
+which sets a new state-of-the-art result in the VQ3D task. We provide a
+comprehensive empirical analysis of the VQ3D task and existing solutions, and
+highlight the remaining challenges in VQ3D. The code is available at
+https://github.com/Wayne-Mai/EgoLoc.
+
+&#13;
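+ The confidence-weighted aggregation step reduces to a weighted mean. A
+hedged, minimal sketch with illustrative shapes and values:
+
+import numpy as np
+
+def aggregate_displacements(displacements, confidences):
+    # displacements: [V, 3] per-view 3D estimates; confidences: [V] 2D scores
+    w = np.asarray(confidences, dtype=float)
+    w = w / w.sum()                        # normalize detection confidences
+    return (w[:, None] * np.asarray(displacements)).sum(axis=0)
+
+print(aggregate_displacements([[1.0, 0.1, 2.0], [1.2, 0.0, 1.8]], [0.9, 0.6]))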
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual + Tracking and Segmentation ICCV2023 + + +
+ Tracking any given object(s) spatially and temporally is a common purpose in +Visual Object Tracking (VOT) and Video Object Segmentation (VOS). Joint +tracking and segmentation have been attempted in some studies but they often +lack full compatibility of both box and mask in initialization and prediction, +and mainly focus on single-object scenarios. To address these limitations, this +paper proposes a Multi-object Mask-box Integrated framework for unified +Tracking and Segmentation, dubbed MITS. Firstly, the unified identification +module is proposed to support both box and mask reference for initialization, +where detailed object information is inferred from boxes or directly retained +from masks. Additionally, a novel pinpoint box predictor is proposed for +accurate multi-object box prediction, facilitating target-oriented +representation learning. All target objects are processed simultaneously from +encoding to propagation and decoding, as a unified pipeline for VOT and VOS. +Experimental results show MITS achieves state-of-the-art performance on both +VOT and VOS benchmarks. Notably, MITS surpasses the best prior VOT competitor +by around 6% on the GOT-10k test set, and significantly improves the +performance of box initialization on VOS benchmarks. The code is available at +https://github.com/yoxu515/MITS. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ♻ ☆ CarDD: A New Dataset for Vision-based Car Damage Detection + + +
+ Automatic car damage detection has attracted significant attention in the car
+insurance business. However, due to the lack of high-quality and publicly
+available datasets, it is hard to learn a feasible model for car damage
+detection. To this end, we contribute Car Damage Detection (CarDD), the first
+public large-scale dataset designed for vision-based car damage detection and
+segmentation. Our CarDD contains 4,000 high-resolution car damage images with
+over 9,000 well-annotated instances of six damage categories. We detail the
+image collection, selection, and annotation processes, and present a
+statistical dataset analysis. Furthermore, we conduct extensive experiments on
+CarDD with state-of-the-art deep methods for different tasks and provide
+comprehensive analyses to highlight the specialty of car damage detection. The
+CarDD dataset and the source code are available at
+https://cardd-ustc.github.io.
+
+&#13;
+
+ comment: 13 pages, 10 figures, full-length paper for Transactions on + Intelligent Transportation Systems (2023) +
+
+
+
+
+ + ♻ ☆ Self-supervised pseudo-colorizing of masked cells + + +
+ Self-supervised learning, which is strikingly referred to as the dark matter +of intelligence, is gaining more attention in biomedical applications of deep +learning. In this work, we introduce a novel self-supervision objective for the +analysis of cells in biomedical microscopy images. We propose training deep +learning models to pseudo-colorize masked cells. We use a physics-informed +pseudo-spectral colormap that is well suited for colorizing cell topology. Our +experiments reveal that approximating semantic segmentation by +pseudo-colorization is beneficial for subsequent fine-tuning on cell detection. +Inspired by the recent success of masked image modeling, we additionally mask +out cell parts and train to reconstruct these parts to further enrich the +learned representations. We compare our pre-training method with +self-supervised frameworks including contrastive learning (SimCLR), masked +autoencoders (MAEs), and edge-based self-supervision. We build upon our +previous work and train hybrid models for cell detection, which contain both +convolutional and vision transformer modules. Our pre-training method can +outperform SimCLR, MAE-like masked image modeling, and edge-based +self-supervision when pre-training on a diverse set of six fluorescence +microscopy datasets. Code is available at: +https://github.com/roydenwa/pseudo-colorize-masked-cells + +
+
+ comment: 14 pages, 3 figures; Published in PLOS ONE +
+
+
+
+
+ + ♻ ☆ Graph-based Topology Reasoning for Driving Scenes + + +
+ Understanding the road genome is essential to realizing autonomous driving. This highly intelligent problem has two aspects - the connection relationship of lanes, and the assignment relationship between lanes and traffic elements - for which a comprehensive topology reasoning method is still missing. On one hand, previous map learning techniques struggle to derive lane connectivity with segmentation or laneline paradigms, while prior lane topology-oriented approaches focus on centerline detection and neglect interaction modeling. On the other hand, the traffic-element-to-lane assignment problem is limited to the image domain, leaving the construction of correspondence across the two views an unexplored challenge. To address these issues, we present TopoNet, the first end-to-end framework capable of abstracting traffic knowledge beyond conventional perception tasks. To capture the driving scene topology, we introduce three key designs: (1) an embedding module to incorporate semantic knowledge from 2D elements into a unified feature space; (2) a curated scene graph neural network to model relationships and enable feature interaction inside the network; and (3) a scene knowledge graph, devised to differentiate prior knowledge from various types of the road genome instead of transmitting messages arbitrarily. We evaluate TopoNet on the challenging scene understanding benchmark OpenLane-V2, where our approach outperforms all previous works by a great margin on all perceptual and topological metrics. The code is released at https://github.com/OpenDriveLab/TopoNet
+
+
+
+
+ + ♻ ☆ No Fear of Classifier Biases: Neural Collapse Inspired Federated + Learning with Synthetic and Fixed Classifier ICCV 2023 + + +
+ Data heterogeneity is an inherent challenge that hinders the performance of federated learning (FL). Recent studies have identified the biased classifiers of local models as the key bottleneck. Previous attempts have used classifier calibration after FL training, but this approach falls short in improving the poor feature representations caused by training-time classifier biases. Resolving the classifier bias dilemma in FL requires a full understanding of the mechanisms behind the classifier. Recent advances in neural collapse have shown that, under perfect training scenarios, classifiers and feature prototypes collapse into an optimal structure called a simplex equiangular tight frame (ETF). Building on this neural collapse insight, we propose a solution to FL's classifier bias problem by utilizing a synthetic and fixed ETF classifier during training. The optimal classifier structure enables all clients to learn unified and optimal feature representations even under extremely heterogeneous data. We devise several effective modules to better adapt the ETF structure in FL, achieving both high generalization and personalization. Extensive experiments demonstrate that our method achieves state-of-the-art performance on CIFAR-10, CIFAR-100, and Tiny-ImageNet.
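For readers unfamiliar with the geometry the method fixes the classifier to, here is a minimal, generic construction of a simplex ETF in PyTorch. This is standard neural-collapse math, not the authors' implementation: K unit-norm class vectors whose pairwise cosine similarity is exactly -1/(K-1).

```python
# Generic simplex-ETF construction usable as a synthetic, fixed classifier.
import torch


def simplex_etf(num_classes: int, feat_dim: int) -> torch.Tensor:
    """Return a (feat_dim, num_classes) matrix whose columns form a simplex ETF."""
    assert feat_dim >= num_classes
    # Random orthonormal basis U (feat_dim x num_classes) via QR decomposition.
    u, _ = torch.linalg.qr(torch.randn(feat_dim, num_classes))
    eye = torch.eye(num_classes)
    ones = torch.ones(num_classes, num_classes) / num_classes
    scale = (num_classes / (num_classes - 1)) ** 0.5
    return scale * u @ (eye - ones)


etf = simplex_etf(num_classes=10, feat_dim=512)
gram = etf.T @ etf            # columns are unit norm by construction
print(gram[0, 0].item())      # ~1.0
print(gram[0, 1].item())      # ~ -1/9, the equiangular off-diagonal value
```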
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Multimodal Motion Conditioned Diffusion Model for Skeleton-based Video + Anomaly Detection ICCV2023 + + +
+ Anomalies are rare, and anomaly detection is therefore often framed as One-Class Classification (OCC), i.e. trained solely on normalcy. Leading OCC techniques constrain the latent representations of normal motions to limited volumes and detect anything outside as abnormal, which accounts satisfactorily for the open-set nature of anomalies. But normalcy shares the same open-set property, since humans can perform the same action in several ways, which the leading techniques neglect. We propose a novel generative model for video anomaly detection (VAD) which assumes that both normality and abnormality are multimodal. We consider skeletal representations and leverage state-of-the-art diffusion probabilistic models to generate multimodal future human poses. We contribute a novel conditioning on the past motion of people and exploit the improved mode-coverage capabilities of diffusion processes to generate different-but-plausible future motions. Upon statistical aggregation of future modes, an anomaly is detected when the generated set of motions is not pertinent to the actual future. We validate our model on 4 established benchmarks: UBnormal, HR-UBnormal, HR-STC, and HR-Avenue, with extensive experiments surpassing state-of-the-art results.
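The scoring idea can be summarized in a few lines. The sketch below is our hedged reading of it, not the released code; `sample_futures` is a hypothetical stand-in for the conditional diffusion sampler.

```python
# Sketch: sample several plausible futures conditioned on the past motion,
# then flag an anomaly when even the closest generated mode is far from the
# observed future.
import numpy as np


def anomaly_score(past_poses, true_future, sample_futures, k=20):
    """past_poses/true_future: pose arrays; sample_futures: assumed sampler."""
    # k candidate future pose sequences, shape (k, T, J, 2) by assumption.
    candidates = sample_futures(past_poses, num_samples=k)
    # Mean joint-wise L2 error of each candidate against the actual future.
    errs = [np.linalg.norm(c - true_future, axis=-1).mean() for c in candidates]
    # High minimum error => none of the generated modes explains reality.
    return min(errs)
```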
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Efficient Video Action Detection with Token Dropout and Context + Refinement + + +
+ Streaming video clips with large numbers of video tokens impede efficient recognition with vision transformers (ViTs), especially in video action detection, where sufficient spatiotemporal representations are required for precise actor identification. In this work, we propose an end-to-end framework for efficient video action detection (EVAD) based on vanilla ViTs. Our EVAD consists of two specialized designs for video action detection. First, we propose spatiotemporal token dropout from a keyframe-centric perspective: in a video clip, we maintain all tokens from its keyframe, preserve tokens relevant to actor motions from other frames, and drop out the remaining tokens in the clip. Second, we refine scene context by leveraging the remaining tokens for better recognition of actor identities. The region of interest (RoI) in our action detector is expanded into the temporal domain, and the captured spatiotemporal actor identity representations are refined via scene context in a decoder with an attention mechanism. These two designs make our EVAD efficient while maintaining accuracy, which is validated on three benchmark datasets (i.e., AVA, UCF101-24, and JHMDB). Compared to the vanilla ViT backbone, our EVAD reduces the overall GFLOPs by 43% and improves real-time inference speed by 40% with no performance degradation. Moreover, even at similar computational costs, our EVAD can improve performance by 1.1 mAP with higher-resolution inputs. Code is available at https://github.com/MCG-NJU/EVAD.
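The keyframe-centric dropout can be pictured with a short sketch. This is an illustrative approximation under our own assumptions (the motion-relevance score here is a simple feature-difference proxy), not the EVAD release.

```python
# Sketch: keep all keyframe tokens; from other frames keep only the top-k
# tokens whose features differ most from the keyframe (a motion proxy).
import torch


def keyframe_token_dropout(tokens: torch.Tensor, key_idx: int,
                           keep_ratio: float = 0.3) -> torch.Tensor:
    """tokens: (T, N, C) patch tokens per frame; key_idx: keyframe index."""
    T, N, _ = tokens.shape
    kept = [tokens[key_idx]]                       # every keyframe token
    for t in range(T):
        if t == key_idx:
            continue
        # Motion-relevance proxy: feature change relative to the keyframe.
        score = (tokens[t] - tokens[key_idx]).norm(dim=-1)   # (N,)
        k = max(1, int(keep_ratio * N))
        kept.append(tokens[t, score.topk(k).indices])
    return torch.cat(kept, dim=0)                  # (N + (T-1)*k, C)
```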
+
+ comment: technical report +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive + Review + + +
+ The advent of deep learning has brought a revolutionary transformation to +image denoising techniques. However, the persistent challenge of acquiring +noise-clean pairs for supervised methods in real-world scenarios remains +formidable, necessitating the exploration of more practical self-supervised +image denoising. This paper focuses on self-supervised image denoising methods +that offer effective solutions to address this challenge. Our comprehensive +review thoroughly analyzes the latest advancements in self-supervised image +denoising approaches, categorizing them into three distinct classes: General +methods, Blind Spot Network (BSN)-based methods, and Transformer-based methods. +For each class, we provide a concise theoretical analysis along with their +practical applications. To assess the effectiveness of these methods, we +present both quantitative and qualitative experimental results on various +datasets, utilizing classical algorithms as benchmarks. Additionally, we +critically discuss the current limitations of these methods and propose +promising directions for future research. By offering a detailed overview of +recent developments in self-supervised image denoising, this review serves as +an invaluable resource for researchers and practitioners in the field, +facilitating a deeper understanding of this emerging domain and inspiring +further advancements. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Predicting Shape Development: a Riemannian Method + + +
+ Predicting the future development of an anatomical shape from a single baseline observation is a challenging task, but it can be essential for clinical decision-making. Research has shown that it should be tackled in curved shape spaces, as (e.g., disease-related) shape changes frequently expose nonlinear characteristics. We thus propose a novel prediction method that encodes the whole shape in a Riemannian shape space. It then learns a simple prediction technique founded on hierarchical statistical modeling of longitudinal training data. When applied to predict the future development of the shape of the right hippocampus under Alzheimer's disease and to human body motion, it outperforms deep learning-supported variants as well as the state of the art.
+
+ comment: new experiment with human motion data; fixed vertex-assignment bug in + the prediction of the varifold-based method +
+
+
+
+
+ + ♻ ☆ Efficient Decision-based Black-box Patch Attacks on Video Recognition + + +
+ Although Deep Neural Networks (DNNs) have demonstrated excellent performance, they are vulnerable to adversarial patches that introduce perceptible and localized perturbations to the input. Generating adversarial patches on images has received much attention, while adversarial patches on videos have not been well investigated. Further, decision-based attacks, where attackers only access the predicted hard labels by querying threat models, have not been well explored on video models either, even though they are practical in real-world video recognition scenes. The absence of such studies leads to a huge gap in the robustness assessment of video models. To bridge this gap, this work first explores decision-based patch attacks on video models. Our analysis shows that the huge parameter space introduced by videos and the minimal information returned by decision-based models both greatly increase the attack difficulty and query burden. To achieve a query-efficient attack, we propose a spatial-temporal differential evolution (STDE) framework. First, STDE introduces target videos as patch textures and only adds patches on keyframes that are adaptively selected by temporal difference. Second, STDE takes minimizing the patch area as the optimization objective and adopts spatial-temporal mutation and crossover to search for the global optimum without falling into local optima. Experiments show that STDE achieves state-of-the-art performance in terms of threat, efficiency, and imperceptibility. Hence, STDE has the potential to be a powerful tool for evaluating the robustness of video recognition models.
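For orientation, a generic differential-evolution loop of the kind this search builds on is sketched below. It is a simplified textbook DE with greedy selection, under our own assumptions about the encoding (each candidate encodes patch rectangles; the fitness is assumed to return the patch area when the candidate fools the model and infinity otherwise). It is not the STDE release.

```python
# Generic differential evolution: mutation a + F*(b - c), binomial crossover,
# and greedy selection toward smaller adversarial patch area.
import numpy as np


def de_patch_search(fitness, bounds, pop=20, iters=100, F=0.5, CR=0.7):
    """fitness(x) -> patch area if x is adversarial, np.inf otherwise."""
    dim = len(bounds)
    lo, hi = np.array(bounds, dtype=float).T
    X = lo + np.random.rand(pop, dim) * (hi - lo)
    f = np.array([fitness(x) for x in X])
    for _ in range(iters):
        for i in range(pop):
            a, b, c = X[np.random.choice(pop, 3, replace=False)]
            mutant = np.clip(a + F * (b - c), lo, hi)                 # mutation
            trial = np.where(np.random.rand(dim) < CR, mutant, X[i])  # crossover
            ft = fitness(trial)
            if ft < f[i]:                                             # selection
                X[i], f[i] = trial, ft
    return X[f.argmin()]        # best (smallest-area adversarial) candidate
```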
+
+
+
+
+ + ♻ ☆ Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in + First Trimester 3D Ultrasound + + +
+ Segmentation and spatial alignment of ultrasound (US) imaging data acquired in the first trimester are crucial for monitoring human embryonic growth and development throughout this critical period of life. Current approaches are either manual or semi-automatic and are therefore very time-consuming and prone to errors. To automate these tasks, we propose a multi-atlas framework for automatic segmentation and spatial alignment of the embryo using deep learning with minimal supervision. Our framework learns to register the embryo to an atlas, which consists of US images acquired at a range of gestational ages (GA), segmented and spatially aligned to a predefined standard orientation. From this, we can derive the segmentation of the embryo and put the embryo in the standard orientation. US images acquired from 8+0 until 12+6 weeks GA were used, and eight subjects were selected as atlases. We evaluated different fusion strategies to incorporate multiple atlases: 1) training the framework using atlas images from a single subject, 2) training the framework with data from all available atlases, and 3) ensembling the frameworks trained per subject. To evaluate the performance, we calculated the Dice score over the test set. We found that training the framework using all available atlases outperformed ensembling and gave results similar to the best of all frameworks trained on a single subject. Furthermore, we found that selecting images from the four atlases closest in GA out of all available atlases, regardless of individual quality, gave the best results, with a median Dice score of 0.72. We conclude that our framework can accurately segment and spatially align the embryo in first-trimester 3D US images and is robust to the variation in quality among the available atlases.
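The evaluation metric is the standard Dice overlap; a minimal helper (the generic definition, not the authors' exact evaluation script) looks like this.

```python
# Dice = 2|A ∩ B| / (|A| + |B|) for binary masks.
import numpy as np


def dice(pred: np.ndarray, gt: np.ndarray) -> float:
    pred, gt = pred.astype(bool), gt.astype(bool)
    denom = pred.sum() + gt.sum()
    # Convention: two empty masks count as a perfect match.
    return 2.0 * np.logical_and(pred, gt).sum() / denom if denom else 1.0
```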
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html +
+
+
+
+
+ + ♻ ☆ GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling + for Multi-view 3D Understanding + + +
+ Multi-view camera-based 3D detection is a challenging problem in computer vision. Recent works leverage a pretrained LiDAR detection model to transfer knowledge to a camera-based student network. However, we argue that there is a major domain gap between the LiDAR BEV features and the camera-based BEV features, as they have different characteristics and are derived from different sources. In this paper, we propose Geometry Enhanced Masked Image Modeling (GeoMIM) to transfer the knowledge of the LiDAR model in a pretrain-finetune paradigm for improving multi-view camera-based 3D detection. GeoMIM is a multi-camera vision transformer with Cross-View Attention (CVA) blocks that uses LiDAR BEV features encoded by the pretrained BEV model as learning targets. During pretraining, GeoMIM's decoder has a semantic branch that completes dense perspective-view features and a geometry branch that reconstructs dense perspective-view depth maps. The depth branch is designed to be camera-aware by taking the camera's parameters as input for better transfer capability. Extensive results demonstrate that GeoMIM outperforms existing methods on the nuScenes benchmark, achieving state-of-the-art performance for camera-based 3D object detection and 3D segmentation. Code and pretrained models are available at https://github.com/Sense-X/GeoMIM.
+
+ comment: Release code: https://github.com/Sense-X/GeoMIM +
+
+
+
+
+ + ♻ ☆ DDH-QA: A Dynamic Digital Humans Quality Assessment Database + + +
+ In recent years, a large amount of effort has been put into pushing forward the real-world application of dynamic digital humans (DDHs). However, most current quality assessment research focuses on evaluating static 3D models and usually ignores motion distortions. Therefore, in this paper, we construct a large-scale dynamic digital human quality assessment (DDH-QA) database with diverse motion content as well as multiple distortions to comprehensively study the perceptual quality of DDHs. Both model-based distortions (noise, compression) and motion-based distortions (binding errors, motion unnaturalness) are taken into consideration. Ten types of common motion are employed to drive the DDHs, and a total of 800 DDHs are generated in the end. Afterward, we render video sequences of the distorted DDHs as the evaluation media and carry out a well-controlled subjective experiment. A benchmark experiment is then conducted with state-of-the-art video quality assessment (VQA) methods, and the experimental results show that existing VQA methods are limited in assessing the perceptual loss of DDHs.
+
+
+
+
+ + ♻ ☆ Region-Aware Pretraining for Open-Vocabulary Object Detection with + Vision Transformers CVPR 2023 + + +
+ We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a contrastive image-text pretraining recipe to bridge the gap between image-level pretraining and open-vocabulary object detection. In the pretraining phase, we propose to randomly crop and resize regions of the positional embeddings instead of using the whole-image positional embeddings. This better matches the use of positional embeddings at the region level in the detection finetuning phase. In addition, we replace the common softmax cross-entropy loss in contrastive learning with focal loss to better learn from informative yet difficult examples. Finally, we leverage recent advances in novel object proposals to improve open-vocabulary detection finetuning. We evaluate our full model on the LVIS and COCO open-vocabulary detection benchmarks and on zero-shot transfer. RO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best existing approach by +7.8 points, in addition to competitive zero-shot transfer detection. Surprisingly, RO-ViT improves the image-level representation as well, achieving the state of the art on 9 out of 12 metrics on the COCO and Flickr image-text retrieval benchmarks and outperforming competitive approaches with larger models.
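The crop-and-resize trick on positional embeddings is easy to picture in code. The sketch below is our reading of the recipe under stated assumptions (a square patch grid, crops covering at least half of each side), not the official implementation.

```python
# Sketch: crop a random region of the 2D positional-embedding grid and
# resize it back to the full grid, so image-level pretraining mimics
# region-level use at detection time.
import torch
import torch.nn.functional as F


def cropped_pos_embed(pos: torch.Tensor, grid: int) -> torch.Tensor:
    """pos: (grid*grid, C) positional embeddings for a grid x grid layout."""
    C = pos.shape[-1]
    pos2d = pos.reshape(1, grid, grid, C).permute(0, 3, 1, 2)   # (1, C, H, W)
    # Random crop box; each side spans at least half the grid (assumption).
    h = torch.randint(grid // 2, grid + 1, ()).item()
    w = torch.randint(grid // 2, grid + 1, ()).item()
    top = torch.randint(0, grid - h + 1, ()).item()
    left = torch.randint(0, grid - w + 1, ()).item()
    crop = pos2d[:, :, top:top + h, left:left + w]
    # Resize the cropped embeddings back to the full token grid.
    out = F.interpolate(crop, size=(grid, grid), mode="bilinear",
                        align_corners=False)
    return out.permute(0, 2, 3, 1).reshape(grid * grid, C)
```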
+
+ comment: CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B + result +
+
+
+
+
+ + ♻ ☆ Predicting Class Distribution Shift for Reliable Domain Adaptive Object + Detection + + +
+ Unsupervised Domain Adaptive Object Detection (UDA-OD) uses unlabelled data +to improve the reliability of robotic vision systems in open-world +environments. Previous approaches to UDA-OD based on self-training have been +effective in overcoming changes in the general appearance of images. However, +shifts in a robot's deployment environment can also impact the likelihood that +different objects will occur, termed class distribution shift. Motivated by +this, we propose a framework for explicitly addressing class distribution shift +to improve pseudo-label reliability in self-training. Our approach uses the +domain invariance and contextual understanding of a pre-trained joint vision +and language model to predict the class distribution of unlabelled data. By +aligning the class distribution of pseudo-labels with this prediction, we +provide weak supervision of pseudo-label accuracy. To further account for low +quality pseudo-labels early in self-training, we propose an approach to +dynamically adjust the number of pseudo-labels per image based on model +confidence. Our method outperforms state-of-the-art approaches on several +benchmarks, including a 4.7 mAP improvement when facing challenging class +distribution shift. + +
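One way to read the alignment step is as per-class quota selection driven by the predicted class distribution. The sketch below is our rough interpretation, not the authors' code; the quota rule and greedy selection are assumptions.

```python
# Sketch: keep pseudo-labels so their class histogram matches a predicted
# target-domain class distribution (e.g., from a vision-language model).
import numpy as np


def align_pseudo_labels(scores: np.ndarray, predicted_dist: np.ndarray,
                        budget: int):
    """scores: (N, K) detector confidences; predicted_dist: (K,) prior."""
    N, K = scores.shape
    picks = []
    for k in range(K):
        quota = int(round(budget * predicted_dist[k]))   # per-class quota
        order = np.argsort(-scores[:, k])[:quota]        # most confident first
        picks.extend((int(i), k) for i in order)
    # (detection index, class) pairs retained as pseudo-labels; a real
    # implementation would also resolve conflicts across classes.
    return picks
```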
+
+
+
+
+ + ♻ ☆ Instance-incremental Scene Graph Generation from Real-world Point Clouds + via Normalizing Flows + + +
+ This work introduces a new task of instance-incremental scene graph generation: given the point cloud of a scene, represent it as a graph and automatically add novel instances, so that a graph denoting the object layout of the scene is finally generated. It is an important task, since it helps to guide the insertion of novel 3D objects into real-world scenes in vision-based applications like augmented reality. It is also challenging, because the complexity of real-world point clouds makes it difficult to learn object layout experiences from the observation data (non-empty rooms with labeled semantics). We model this task as a conditional generation problem and propose a 3D autoregressive framework based on normalizing flows (3D-ANF) to address it. First, we represent the point cloud as a graph by extracting the label semantics and contextual relationships. Next, a model based on normalizing flows is introduced to map the conditional generation of graphic elements onto the Gaussian process; the mapping is invertible. Thus, the real-world experiences represented in the observation data can be modeled in the training phase, and novel instances can be autoregressively generated based on the Gaussian process in the testing phase. To thoroughly evaluate the performance of our method, we implement this new task on the indoor benchmark dataset 3DSSG-O27R16 and our newly proposed graphical dataset of outdoor scenes, GPL3D. Experiments show that our method generates reliable novel graphs from real-world point clouds and achieves state-of-the-art performance on these datasets.
+
+ comment: Accepted by IEEE TCSVT. The supplementary material is available in + the media column of the journal version of the article +
+
+
+
+
+ + ♻ ☆ GINA-3D: Learning to Generate Implicit Neural Assets in the Wild CVPR 2023 + + +
+ Modeling the 3D world from sensor data for simulation is a scalable way of developing testing and validation environments for robotic learning problems such as autonomous driving. However, manually creating or re-creating real-world-like environments is difficult, expensive, and not scalable. Recent generative model techniques have shown promising progress in addressing such challenges by learning 3D assets using only plentiful 2D images -- but they still suffer limitations, as they leverage either human-curated image datasets or renderings from manually-created synthetic 3D environments. In this paper, we introduce GINA-3D, a generative model that uses real-world driving data from camera and LiDAR sensors to create realistic 3D implicit neural assets of diverse vehicles and pedestrians. Compared to existing image datasets, the real-world driving setting poses new challenges due to occlusions, lighting variations, and long-tail distributions. GINA-3D tackles these challenges by decoupling representation learning and generative modeling into two stages with a learned tri-plane latent structure, inspired by recent advances in generative modeling of images. To evaluate our approach, we construct a large-scale object-centric dataset containing over 1.2M images of vehicles and pedestrians from the Waymo Open Dataset, and a new set of 80K images of long-tail instances such as construction equipment, garbage trucks, and cable cars. We compare our model with existing approaches and demonstrate that it achieves state-of-the-art performance in quality and diversity for both generated images and geometries.
+
+ comment: Accepted by CVPR 2023; Our WOD-ObjectAsset can be accessed through + waymo.com/open +
+
+
+
+
+ + ♻ ☆ $BT^2$: Backward-compatible Training with Basis Transformation + + +
+ Modern retrieval systems often require recomputing the representation of every piece of data in the gallery when updating to a better representation model. This process is known as backfilling and can be especially costly in the real world, where the gallery often contains billions of samples. Recently, researchers have proposed the idea of Backward Compatible Training (BCT), where the new representation model can be trained with an auxiliary loss to make it backward compatible with the old representation. In this way, the new representation can be directly compared with the old representation, in principle avoiding the need for any backfilling. However, follow-up work shows that there is an inherent tradeoff where a backward compatible representation model cannot simultaneously maintain the performance of the new model itself. This paper reports our ``not-so-surprising'' finding that adding extra dimensions to the representation can help here. However, we also found that naively increasing the dimension of the representation did not work. To deal with this, we propose Backward-compatible Training with a novel Basis Transformation ($BT^2$). A basis transformation (BT) is essentially a learnable set of parameters that applies an orthonormal transformation. Such a transformation possesses an important property whereby the original information contained in its input is retained in its output. We show in this paper how a BT can be utilized to add only the necessary amount of additional dimensions. We empirically verify the advantage of $BT^2$ over other state-of-the-art methods in a wide range of settings. We then further extend $BT^2$ to other challenging yet more practical settings, including significant changes in model architecture (CNNs to Transformers), modality change, and even a series of updates to the model architecture mimicking the evolution of deep learning models.
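The information-preserving property of an orthonormal map is straightforward to demonstrate. Below is a minimal sketch, assuming the basis transformation is a learnable orthogonal matrix applied to the embedding; this uses PyTorch's orthogonal parametrization and is not the official $BT^2$ implementation.

```python
# A learnable orthogonal (norm-preserving, invertible) basis transformation.
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import orthogonal


class BasisTransform(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        # Parametrized so self.linear.weight stays orthogonal during training.
        self.linear = orthogonal(nn.Linear(dim, dim, bias=False))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


bt = BasisTransform(128)
x = torch.randn(4, 128)
y = bt(x)
# Orthonormal maps preserve norms, so no input information is destroyed.
print(torch.allclose(x.norm(dim=1), y.norm(dim=1), atol=1e-5))  # True
```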
+
+ comment: ICCV 2023 camera ready
+
+
+
+
+ + ♻ ☆ One-shot Ultra-high-Resolution Generative Adversarial Network That + Synthesizes 16K Images On A Single GPU + + +
+ We propose a one-shot ultra-high-resolution generative adversarial network (OUR-GAN) framework that generates non-repetitive 16K (16,384 x 8,640) images from a single training image and is trainable on a single consumer GPU. OUR-GAN generates an initial image that is visually plausible and varied in shape at low resolution, and then gradually increases the resolution by adding detail through super-resolution. Since OUR-GAN learns from a real ultra-high-resolution (UHR) image, it can synthesize large shapes with fine details and long-range coherence, which is difficult to achieve with conventional generative models that rely on the patch distribution learned from relatively small images. OUR-GAN can synthesize high-quality 16K images with 12.5 GB of GPU memory and 4K images with only 4.29 GB, as it synthesizes a UHR image part by part through seamless subregion-wise super-resolution. Additionally, OUR-GAN improves visual coherence while maintaining diversity by applying vertical positional convolution. In experiments on the ST4K and RAISE datasets, OUR-GAN exhibited improved fidelity, visual coherency, and diversity compared with the baseline one-shot synthesis models. To the best of our knowledge, OUR-GAN is the first one-shot image synthesizer that generates non-repetitive UHR images on a single consumer GPU. The synthesized image samples are presented at https://our-gan.github.io.
+
+ comment: 36 pages, 26 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
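The uncertainty quantities that EDL provides are compact enough to show directly. The sketch below is the standard evidential-deep-learning math the paper builds on (generic, not the ANEDL code): non-negative evidence parameterizes a Dirichlet, and the vacuity K / sum(alpha) is high for unfamiliar inputs such as outliers.

```python
# Generic EDL head: evidence -> Dirichlet -> expected probabilities and vacuity.
import torch
import torch.nn.functional as F


def edl_outputs(logits: torch.Tensor):
    evidence = F.softplus(logits)           # non-negative evidence per class
    alpha = evidence + 1.0                  # Dirichlet concentration parameters
    strength = alpha.sum(dim=-1, keepdim=True)
    prob = alpha / strength                 # expected class probabilities
    K = logits.shape[-1]
    uncertainty = K / strength.squeeze(-1)  # vacuity: high for unseen outliers
    return prob, uncertainty
```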
+
+
+
+
+ + ♻ ☆ Enhancing Breast Cancer Risk Prediction by Incorporating Prior Images MICCAI 2023 + + +
+ Recently, deep learning models have shown the potential to predict breast cancer risk and enable targeted screening strategies, but current models do not consider changes in the breast over time. In this paper, we present a new method, PRIME+, for breast cancer risk prediction that leverages prior mammograms using a transformer decoder, outperforming a state-of-the-art risk prediction method that only uses mammograms from a single time point. We validate our approach on a dataset with 16,113 exams and further demonstrate that it effectively captures patterns of change relative to prior mammograms, such as changes in breast density, resulting in improved short-term and long-term breast cancer risk prediction. Experimental results show that our model achieves a statistically significant improvement in performance over the state-of-the-art model, with a C-index increase from 0.68 to 0.73 (p < 0.05) on held-out test sets.
+
+ comment: MICCAI 2023 accepted +
+
+
+
+
+ + ♻ ☆ Bayesian Optimization Meets Self-Distillation ICCV 2023 + + +
+ Bayesian optimization (BO) has contributed greatly to improving model +performance by suggesting promising hyperparameter configurations iteratively +based on observations from multiple training trials. However, only partial +knowledge (i.e., the measured performances of trained models and their +hyperparameter configurations) from previous trials is transferred. On the +other hand, Self-Distillation (SD) only transfers partial knowledge learned by +the task model itself. To fully leverage the various knowledge gained from all +training trials, we propose the BOSS framework, which combines BO and SD. BOSS +suggests promising hyperparameter configurations through BO and carefully +selects pre-trained models from previous trials for SD, which are otherwise +abandoned in the conventional BO process. BOSS achieves significantly better +performance than both BO and SD in a wide range of tasks including general +image classification, learning with noisy labels, semi-supervised learning, and +medical image analysis tasks. + +
+
+ comment: ICCV 2023 accepted +
+
+
+
+
+ + ♻ ☆ CLE Diffusion: Controllable Light Enhancement Diffusion Model + + +
+ Low light enhancement has gained increasing importance with the rapid +development of visual creation and editing. However, most existing enhancement +algorithms are designed to homogeneously increase the brightness of images to a +pre-defined extent, limiting the user experience. To address this issue, we +propose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a +novel diffusion framework to provide users with rich controllability. Built +with a conditional diffusion model, we introduce an illumination embedding to +let users control their desired brightness level. Additionally, we incorporate +the Segment-Anything Model (SAM) to enable user-friendly region +controllability, where users can click on objects to specify the regions they +wish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves +competitive performance regarding quantitative metrics, qualitative results, +and versatile controllability. Project page: +https://yuyangyin.github.io/CLEDiffusion/ + +
+
+ comment: Accepted In Proceedings of the 31st ACM International Conference on + Multimedia (MM' 23) +
+
+
+
+
+ + ♻ ☆ TRansPose: Large-Scale Multispectral Dataset for Transparent Object + + +
+ Transparent objects are encountered frequently in our daily lives, yet recognizing them poses challenges for conventional vision sensors due to their unique material properties, which are not well perceived by RGB or depth cameras. Overcoming this limitation, thermal infrared cameras have emerged as a solution, offering improved visibility and shape information for transparent objects. In this paper, we present TRansPose, the first large-scale multispectral dataset that combines stereo RGB-D, thermal infrared (TIR) images, and object poses to promote transparent object research. The dataset includes 99 transparent objects, encompassing 43 household items, 27 recyclable trash items, 29 pieces of chemical laboratory equipment, and 12 non-transparent objects. It comprises a vast collection of 333,819 images and 4,000,056 annotations, providing instance-level segmentation masks, ground-truth poses, and completed depth information. The data was acquired using a FLIR A65 thermal infrared (TIR) camera, two Intel RealSense L515 RGB-D cameras, and a Franka Emika Panda robot manipulator. Spanning 87 sequences, TRansPose covers various challenging real-life scenarios, including objects filled with water, diverse lighting conditions, heavy clutter, non-transparent or translucent containers, objects in plastic bags, and multi-stacked objects. The TRansPose dataset can be accessed at the following link: https://sites.google.com/view/transpose-dataset
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Hiding Visual Information via Obfuscating Adversarial Perturbations + + +
+ Growing leakage and misuse of visual information raise security and privacy concerns, which promotes the development of information protection techniques. Existing adversarial perturbation-based methods mainly focus on de-identification against deep learning models. However, the inherent visual information of the data has not been well protected. In this work, inspired by the Type-I adversarial attack, we propose an adversarial visual information hiding method to protect the visual privacy of data. Specifically, the method generates obfuscating adversarial perturbations to obscure the visual information of the data. Meanwhile, it maintains the hidden objectives to be correctly predicted by models. In addition, our method does not modify the parameters of the applied model, which makes it flexible for different scenarios. Experimental results on recognition and classification tasks demonstrate that the proposed method can effectively hide visual information and hardly affects the performance of models. The code is available in the supplementary material.
+
+
+
+
+ + ♻ ☆ Deep Generative Models on 3D Representations: A Survey + + +
+ Generative models aim to learn the distribution of observed data by +generating new instances. With the advent of neural networks, deep generative +models, including variational autoencoders (VAEs), generative adversarial +networks (GANs), and diffusion models (DMs), have progressed remarkably in +synthesizing 2D images. Recently, researchers started to shift focus from 2D to +3D space, considering that 3D data is more closely aligned with our physical +world and holds immense practical potential. However, unlike 2D images, which +possess an inherent and efficient representation (\textit{i.e.}, a pixel grid), +representing 3D data poses significantly greater challenges. Ideally, a robust +3D representation should be capable of accurately modeling complex shapes and +appearances while being highly efficient in handling high-resolution data with +high processing speeds and low memory requirements. Regrettably, existing 3D +representations, such as point clouds, meshes, and neural fields, often fail to +satisfy all of these requirements simultaneously. In this survey, we thoroughly +review the ongoing developments of 3D generative models, including methods that +employ 2D and 3D supervision. Our analysis centers on generative models, with a +particular focus on the representations utilized in this context. We believe +our survey will help the community to track the field's evolution and to spark +innovative ideas to propel progress towards solving this challenging task. + +
+
+ comment: Github: https://github.com/justimyhxu/awesome-3D-generation +
+
+
+
+
+ + ♻ ☆ DynamicISP: Dynamically Controlled Image Signal Processor for Image + Recognition ICCV2023 + + +
+ Image Signal Processors (ISPs) play important roles in image recognition +tasks as well as in the perceptual quality of captured images. In most cases, +experts make a lot of effort to manually tune many parameters of ISPs, but the +parameters are sub-optimal. In the literature, two types of techniques have +been actively studied: a machine learning-based parameter tuning technique and +a DNN-based ISP technique. The former is lightweight but lacks expressive +power. The latter has expressive power, but the computational cost is too heavy +on edge devices. To solve these problems, we propose "DynamicISP," which +consists of multiple classical ISP functions and dynamically controls the +parameters of each frame according to the recognition result of the previous +frame. We show our method successfully controls the parameters of multiple ISP +functions and achieves state-of-the-art accuracy with low computational cost in +single and multi-category object detection tasks. + +
+
+ comment: Accepted to ICCV2023. Several updates from v2 including additional + experiments and modification of typos in Auto Gain equation +
+
+
+
+
+ + ♻ ☆ Ego-Body Pose Estimation via Ego-Head Pose Estimation CVPR 2023 + + +
+ Estimating 3D human motion from an egocentric video sequence plays a critical +role in human behavior understanding and has various applications in VR/AR. +However, naively learning a mapping between egocentric videos and human motions +is challenging, because the user's body is often unobserved by the front-facing +camera placed on the head of the user. In addition, collecting large-scale, +high-quality datasets with paired egocentric videos and 3D human motions +requires accurate motion capture devices, which often limit the variety of +scenes in the videos to lab-like environments. To eliminate the need for paired +egocentric video and human motions, we propose a new method, Ego-Body Pose +Estimation via Ego-Head Pose Estimation (EgoEgo), which decomposes the problem +into two stages, connected by the head motion as an intermediate +representation. EgoEgo first integrates SLAM and a learning approach to +estimate accurate head motion. Subsequently, leveraging the estimated head pose +as input, EgoEgo utilizes conditional diffusion to generate multiple plausible +full-body motions. This disentanglement of head and body pose eliminates the +need for training datasets with paired egocentric videos and 3D human motion, +enabling us to leverage large-scale egocentric video datasets and motion +capture datasets separately. Moreover, for systematic benchmarking, we develop +a synthetic dataset, AMASS-Replica-Ego-Syn (ARES), with paired egocentric +videos and human motion. On both ARES and real data, our EgoEgo model performs +significantly better than the current state-of-the-art methods. + +
+
+ comment: CVPR 2023 (Award Candidate) +
+
+
+
+
+ + ♻ ☆ Exploring the Mutual Influence between Self-Supervised Single-Frame and + Multi-Frame Depth Estimation + + +
+ Although both self-supervised single-frame and multi-frame depth estimation +methods only require unlabeled monocular videos for training, the information +they leverage varies because single-frame methods mainly rely on +appearance-based features while multi-frame methods focus on geometric cues. +Considering the complementary information of single-frame and multi-frame +methods, some works attempt to leverage single-frame depth to improve +multi-frame depth. However, these methods can neither exploit the difference +between single-frame depth and multi-frame depth to improve multi-frame depth +nor leverage multi-frame depth to optimize single-frame depth models. To fully +utilize the mutual influence between single-frame and multi-frame methods, we +propose a novel self-supervised training framework. Specifically, we first +introduce a pixel-wise adaptive depth sampling module guided by single-frame +depth to train the multi-frame model. Then, we leverage the minimum +reprojection based distillation loss to transfer the knowledge from the +multi-frame depth network to the single-frame network to improve single-frame +depth. Finally, we regard the improved single-frame depth as a prior to further +boost the performance of multi-frame depth estimation. Experimental results on +the KITTI and Cityscapes datasets show that our method outperforms existing +approaches in the self-supervised monocular setting. + +
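The distillation step lends itself to a short sketch. This is our hedged reading of a minimum-reprojection-based distillation (in the spirit of the pixelwise-minimum photometric error popularized by monodepth2), not the authors' code: the multi-frame teacher is trusted only at pixels where it reprojects better than the student.

```python
# Sketch: distill multi-frame depth into the single-frame network only where
# the multi-frame prediction achieves lower photometric reprojection error.
import torch


def min_reproj_distill(single_depth: torch.Tensor,
                       multi_depth: torch.Tensor,
                       err_single: torch.Tensor,
                       err_multi: torch.Tensor) -> torch.Tensor:
    """err_*: per-pixel photometric reprojection errors of each prediction."""
    mask = (err_multi < err_single).float()          # where the teacher is better
    diff = (single_depth - multi_depth.detach()).abs()
    return (mask * diff).sum() / mask.sum().clamp(min=1.0)
```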
+
+ comment: Accepted for publication in the IEEE Robotics and Automation Letters + (RA-L). 8 pages, 3figures +
+
+
+
+
+ + ♻ ☆ Unsupervised Anomaly Detection in Medical Images Using Masked Diffusion + Model MICCAI 2023 + + +
+ It can be challenging to identify brain MRI anomalies using supervised deep-learning techniques due to anatomical heterogeneity and the requirement for pixel-level labeling. Unsupervised anomaly detection approaches provide an alternative solution by relying only on sample-level labels of healthy brains to generate a desired representation to identify abnormalities at the pixel level. Although generative models are crucial for generating such anatomically consistent representations of healthy brains, accurately generating the intricate anatomy of the human brain remains a challenge. In this study, we present a method called masked-DDPM (mDDPM), which introduces masking-based regularization to reframe the generation task of diffusion models. Specifically, we introduce Masked Image Modeling (MIM) and Masked Frequency Modeling (MFM) in our self-supervised approach, enabling models to learn visual representations from unlabeled data. To the best of our knowledge, this is the first attempt to apply MFM in DDPM models for medical applications. We evaluate our approach on datasets containing tumors and multiple sclerosis lesions and exhibit the superior performance of our unsupervised method compared to existing fully/weakly supervised baselines. Code is available at https://github.com/hasan1292/mDDPM.
+
+ comment: Accepted in MICCAI 2023 Workshops +
+
+
+
+
+ + ♻ ☆ Score-Based Diffusion Models as Principled Priors for Inverse Imaging ICCV 2023 + + +
+ Priors are essential for reconstructing images from noisy and/or incomplete +measurements. The choice of the prior determines both the quality and +uncertainty of recovered images. We propose turning score-based diffusion +models into principled image priors ("score-based priors") for analyzing a +posterior of images given measurements. Previously, probabilistic priors were +limited to handcrafted regularizers and simple distributions. In this work, we +empirically validate the theoretically-proven probability function of a +score-based diffusion model. We show how to sample from resulting posteriors by +using this probability function for variational inference. Our results, +including experiments on denoising, deblurring, and interferometric imaging, +suggest that score-based priors enable principled inference with a +sophisticated, data-driven image prior. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Spatially Varying Nanophotonic Neural Networks + + +
+ The explosive growth in the computation and energy cost of artificial intelligence has spurred strong interest in new computing modalities as potential alternatives to conventional electronic processors. Photonic processors, which execute operations using photons instead of electrons, have promised to enable optical neural networks with ultra-low latency and power consumption. However, existing optical neural networks, limited by their underlying network designs, have achieved image recognition accuracy much lower than state-of-the-art electronic neural networks. In this work, we close this gap by introducing a large-kernel spatially-varying convolutional neural network learned via low-dimensional reparameterization techniques. We experimentally instantiate the network with a flat meta-optical system that encompasses an array of nanophotonic structures designed to induce angle-dependent responses. Combined with an extremely lightweight electronic backend of approximately 2K parameters, we demonstrate that the nanophotonic neural network reaches 73.80% blind-test classification accuracy on the CIFAR-10 dataset and, for the first time, an optical neural network outperforms the first modern digital neural network -- AlexNet (72.64%) with 57M parameters -- bringing optical neural networks into the modern deep learning era.
+
+
+
+
+ + ♻ ☆ Trajectory Poisson multi-Bernoulli mixture filter for traffic monitoring + using a drone + + +
+ This paper proposes a multi-object tracking (MOT) algorithm for traffic monitoring using a drone equipped with optical and thermal cameras. Object detections on the images are obtained using a neural network for each type of camera. The cameras are modelled as direction-of-arrival (DOA) sensors. Each DOA detection follows a von Mises-Fisher distribution, whose mean direction is obtained by projecting a vehicle position on the ground to the camera. We then use the trajectory Poisson multi-Bernoulli mixture filter (TPMBM), which is a Bayesian MOT algorithm, to optimally estimate the set of vehicle trajectories. We have also developed a parameter estimation algorithm for the measurement model. We have tested the accuracy of the resulting TPMBM filter on synthetic and experimental data sets.
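The DOA measurement model is compact enough to write out. The sketch below is generic von Mises-Fisher math on the unit sphere (the p=3 normalizer is kappa / (4*pi*sinh(kappa))), not the authors' code, and it assumes a moderate concentration kappa to avoid overflow in sinh.

```python
# Sketch of the DOA likelihood: mean direction from projecting a ground-plane
# vehicle position toward the camera, detection scored under a 3D vMF density.
import numpy as np


def doa_mean_direction(vehicle_pos, camera_pos) -> np.ndarray:
    d = np.asarray(vehicle_pos, dtype=float) - np.asarray(camera_pos, dtype=float)
    return d / np.linalg.norm(d)


def vmf_logpdf(x_unit: np.ndarray, mu_unit: np.ndarray, kappa: float) -> float:
    """log p(x) for the von Mises-Fisher distribution on the 2-sphere (p = 3)."""
    c3 = kappa / (4.0 * np.pi * np.sinh(kappa))   # p=3 normalizing constant
    return float(np.log(c3) + kappa * np.dot(x_unit, mu_unit))


mu = doa_mean_direction([10.0, 5.0, 0.0], [0.0, 0.0, 30.0])
print(vmf_logpdf(mu, mu, kappa=50.0))   # highest density at the mean direction
```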
+
+ comment: accepted in IEEE Transactions on Vehicular Technology +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ TRIVEA: Transparent Ranking Interpretation using Visual Explanation of + Black-Box Algorithmic Rankers + + +
+ Ranking schemes drive many real-world decisions, such as where to study, whom to hire, and what to buy. Many of these decisions come with high consequences. For example, a university can be deemed less prestigious if not featured in a top-k list, and consumers might not even explore products that do not get recommended to buyers. At the heart of most of these decisions are opaque ranking schemes, which dictate the ordering of data entities but whose internal logic is inaccessible or proprietary. Drawing inferences about ranking differences is like a guessing game for the stakeholders, such as the rankees (i.e., the entities that are ranked, like product companies) and the decision-makers (i.e., those who use the rankings, like buyers). In this paper, we aim to enable transparency in ranking interpretation by using algorithmic rankers that learn from available data and by enabling human reasoning about the learned ranking differences using explainable AI (XAI) methods. To realize this aim, we leverage the exploration-explanation paradigm of human-data interaction to let human stakeholders explore subsets and groupings of complex multi-attribute ranking data using visual explanations of model fit and attribute influence on rankings. We realize this explanation paradigm for transparent ranking interpretation in TRIVEA, a visual analytic system that is fueled by: i) visualizations of model fit derived from algorithmic rankers that learn the associations between attributes and rankings from available data, and ii) visual explanations derived from XAI methods that help abstract important patterns, such as the relative influence of attributes in different ranking ranges. Using TRIVEA, end users not trained in data science have the agency to transparently reason about the global and local behavior of the rankings without the need to open black-box ranking models, and to develop confidence in the resulting attribute-based inferences. We demonstrate the efficacy of TRIVEA using multiple usage scenarios and subjective feedback from researchers with diverse domain expertise. Keywords: Visual Analytics, Learning-to-Rank, Explainable ML, Ranking
+
+ comment: Accepted for publication in SpringerNature's Visual Computer Journal +
+
+
+
+
+ + ☆ Fairness Through Domain Awareness: Mitigating Popularity Bias For Music + Discovery + + +
+ As online music platforms grow, music recommender systems play a vital role in helping users navigate and discover content within their vast musical databases. At odds with this larger goal is the presence of popularity bias, which causes algorithmic systems to favor mainstream content over potentially more relevant but niche items. In this work, we explore the intrinsic relationship between music discovery and popularity bias. To mitigate this issue, we propose a domain-aware, individual fairness-based approach which addresses popularity bias in graph neural network (GNN) based recommender systems. Our approach uses individual fairness to reflect a ground-truth listening experience, i.e., if two songs sound similar, this similarity should be reflected in their representations. In doing so, we facilitate meaningful music discovery that is robust to popularity bias and grounded in the music domain. We apply our BOOST methodology to two discovery-based tasks, performing recommendations at both the playlist level and the user level. Then, we ground our evaluation in the cold-start setting, showing that our approach outperforms existing fairness benchmarks in both performance and recommendation of lesser-known content. Finally, our analysis explains why our proposed methodology is a novel and promising approach to mitigating popularity bias and improving the discovery of new and niche content in music recommender systems.
+
+
+
+
+ + ☆ Efficient and Accurate Tree Detection from 3D Point Clouds through Paid + Crowdsourcing + + +
+ Accurate tree detection is of growing importance in applications such as +urban planning, forest inventory, and environmental monitoring. In this +article, we present an approach to creating tree maps by annotating them in 3D +point clouds. Point cloud representations allow the precise identification of +tree positions, particularly stem locations, and their heights. Our method +leverages human computational power through paid crowdsourcing, employing a web +tool designed to enable even non-experts to effectively tackle the task. The +primary focus of this paper is to discuss the web tool's development and +strategies to ensure high-quality tree annotations despite encountering noise +in the crowdsourced data. Following our methodology, we achieve quality +measures surpassing 90% for various challenging test sets of diverse +complexities. We emphasize that our tree map creation process, including +initial point cloud collection, can be completed within 1-2 days. + +
+
+ comment: This paper can be considered an extension of the approach presented + by Walter et al. + (https://isprs-annals.copernicus.org/articles/V-4-2020/49/2020/) +
+
+
+
+
+ + ☆ Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware + Pre-training for KBQA CIKM 2023 + + +
+ Knowledge Base Question Answering (KBQA) aims to answer natural language +questions with factual information such as entities and relations in KBs. +However, traditional Pre-trained Language Models (PLMs) are directly +pre-trained on large-scale natural language corpus, which poses challenges for +them in understanding and representing complex subgraphs in structured KBs. To +bridge the gap between texts and structured KBs, we propose a Structured +Knowledge-aware Pre-training method (SKP). In the pre-training stage, we +introduce two novel structured knowledge-aware tasks, guiding the model to +effectively learn the implicit relationship and better representations of +complex subgraphs. In downstream KBQA task, we further design an efficient +linearization strategy and an interval attention mechanism, which assist the +model to better encode complex subgraphs and shield the interference of +irrelevant subgraphs during reasoning respectively. Detailed experiments and +analyses on WebQSP verify the effectiveness of SKP, especially the significant +improvement in subgraph retrieval (+4.08% H@10). + +
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Can Transformer and GNN Help Each Other? + + +
+ Although the Transformer has achieved great success in natural language processing and computer vision, it has difficulty generalizing to medium- and large-scale graph data for two important reasons: (i) high complexity, and (ii) failure to capture the complex and entangled structure information. In graph representation learning, Graph Neural Networks (GNNs) can fuse the graph structure and node attributes but have limited receptive fields. Therefore, we ask whether Transformers and GNNs can be combined to help each other. In this paper, we propose a new model named TransGNN, where Transformer layers and GNN layers are used alternately to improve each other. Specifically, to expand the receptive field and disentangle information aggregation from edges, we propose using the Transformer to aggregate information from more relevant nodes to improve the message passing of GNNs. Besides, to capture the graph structure information, we utilize positional encoding and make use of the GNN layer to fuse the structure into node attributes, which improves the Transformer on graph data. We also propose sampling the most relevant nodes for the Transformer, along with two efficient sample update strategies to lower the complexity. Finally, we theoretically prove that TransGNN is more expressive than GNNs alone, with only extra linear complexity. Experiments on eight datasets corroborate the effectiveness of TransGNN on node and graph classification tasks.
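The alternating design can be pictured with a toy block. This is an illustrative sketch under our own assumptions (a simple mean-aggregation GNN layer and a dense adjacency matrix), not the released TransGNN.

```python
# Toy alternating block: a GNN layer fuses local structure, then a standard
# Transformer encoder layer expands the receptive field globally.
import torch
import torch.nn as nn


class SimpleGNNLayer(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.proj = nn.Linear(2 * dim, dim)

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        # x: (N, C) node features; adj: (N, N) 0/1 adjacency matrix.
        deg = adj.sum(-1, keepdim=True).clamp(min=1)
        neigh = adj @ x / deg                  # mean over graph neighbors
        return torch.relu(self.proj(torch.cat([x, neigh], dim=-1)))


class TransGNNBlock(nn.Module):
    def __init__(self, dim: int, heads: int = 4):
        super().__init__()
        self.gnn = SimpleGNNLayer(dim)
        self.attn = nn.TransformerEncoderLayer(dim, heads, batch_first=True)

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        x = self.gnn(x, adj)                          # local structure
        return self.attn(x.unsqueeze(0)).squeeze(0)   # global attention


block = TransGNNBlock(dim=64)
x = torch.randn(100, 64)
adj = (torch.rand(100, 100) < 0.05).float()
out = block(x, adj)          # (100, 64)
```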
+
+
+
+
+ + ☆ RecMind: Large Language Model Powered Agent For Recommendation + + +
+ Recent advancements in instructing Large Language Models (LLMs) to utilize external tools and execute multi-step plans have significantly enhanced their ability to solve intricate tasks, ranging from mathematical problems to creative writing. Yet, there remains a notable gap in studying the capacity of LLMs to respond to personalized queries such as a recommendation request. To bridge this gap, we have designed an LLM-powered autonomous recommender agent, RecMind, which is capable of providing precise personalized recommendations through careful planning, utilizing tools for obtaining external knowledge, and leveraging individual data. We propose a novel algorithm, Self-Inspiring, to improve the planning ability of the LLM agent. At each intermediate planning step, the LLM 'self-inspires' to consider all previously explored states to plan the next step. This mechanism greatly improves the model's ability to comprehend and utilize historical planning information for recommendation. We evaluate RecMind's performance in various recommendation scenarios, including rating prediction, sequential recommendation, direct recommendation, explanation generation, and review summarization. Our experiments show that RecMind outperforms existing zero/few-shot LLM-based recommendation methods on different recommendation tasks and achieves performance competitive with the recent model P5, which requires full pre-training on the recommendation tasks.
+
+
+
+
+ + ☆ Alleviating Video-Length Effect for Micro-video Recommendation + + +
+ Micro-video platforms such as TikTok are extremely popular nowadays. One important feature is that users no longer select videos of interest from a set; instead, they either watch the recommended video or skip to the next one. As a result, the time length of users' watching behavior becomes the most important signal for identifying preferences. However, our empirical data analysis has revealed a video-length effect: longer videos tend to receive a higher average view time, so adopting such view-time labels for measuring user preferences can easily induce a biased model that favors longer videos. In this paper, we propose a Video Length Debiasing Recommendation (VLDRec) method to alleviate this effect for micro-video recommendation. VLDRec designs a data labeling approach and a sample generation module that better capture user preferences in a view-time-oriented manner. It further leverages multi-task learning to jointly optimize the above samples with the original biased ones. Extensive experiments show that VLDRec can improve users' view time by 1.81% and 11.32% on two real-world datasets, given a recommendation list of fixed overall video length, compared with the best baseline method. Moreover, VLDRec is also more effective in matching users' interests in terms of video content.
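One simple way to realize duration-debiased labeling is to compare each view time against other views of similar-length videos rather than against a global threshold. The sketch below is our assumption about how such labeling could look, not the VLDRec implementation; the bucket count and quantile are illustrative parameters.

```python
# Sketch: bucket videos by duration, then label a view positive only if its
# view time beats the q-quantile within its own duration bucket.
import numpy as np


def debiased_labels(view_time: np.ndarray, duration: np.ndarray,
                    n_buckets: int = 10, q: float = 0.75) -> np.ndarray:
    edges = np.quantile(duration, np.linspace(0, 1, n_buckets + 1))
    bucket = np.clip(np.searchsorted(edges, duration, side="right") - 1,
                     0, n_buckets - 1)
    labels = np.zeros(view_time.shape, dtype=int)
    for b in range(n_buckets):
        m = bucket == b
        if m.any():
            thresh = np.quantile(view_time[m], q)   # per-bucket threshold
            labels[m] = (view_time[m] >= thresh).astype(int)
    return labels
```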
+
+
+
+
+ + ☆ Cross-Modal Retrieval: A Systematic Review of Methods and Future + Directions + + +
+ With the exponential surge in diverse multi-modal data, traditional uni-modal +retrieval methods struggle to meet the needs of users demanding access to data +from various modalities. To address this, cross-modal retrieval has emerged, +enabling interaction across modalities, facilitating semantic matching, and +leveraging complementarity and consistency between different modal data. +Although prior literature undertook a review of the cross-modal retrieval +field, it exhibits numerous deficiencies pertaining to timeliness, taxonomy, +and comprehensiveness. This paper conducts a comprehensive review of +cross-modal retrieval's evolution, spanning from shallow statistical analysis +techniques to vision-language pre-training models. Commencing with a +comprehensive taxonomy grounded in machine learning paradigms, mechanisms, and +models, the paper then delves deeply into the principles and architectures +underpinning existing cross-modal retrieval methods. Furthermore, it offers an +overview of widely used benchmarks, metrics, and performances. Lastly, the +paper probes the prospects and challenges that confront contemporary +cross-modal retrieval, while engaging in a discourse on potential directions +for further progress in the field. To facilitate the research on cross-modal +retrieval, we develop an open-source code repository at +https://github.com/BMC-SDNU/Cross-Modal-Retrieval. + +
+
+
+
+
+ + ☆ RecRec: Algorithmic Recourse for Recommender Systems CIKM 2023 + + +
+ Recommender systems play an essential role in the choices people make in +domains such as entertainment, shopping, food, news, employment, and education. +The machine learning models underlying these recommender systems are often +enormously large and black-box in nature for users, content providers, and +system developers alike. It is often crucial for all stakeholders to understand +the model's rationale behind making certain predictions and recommendations. +This is especially true for the content providers whose livelihoods depend on +the recommender system. Drawing motivation from the practitioners' need, in +this work, we propose RecRec, a recourse framework for recommender systems +targeted towards content providers. Algorithmic recourse in the recommendation +setting is a set of actions that, if executed, would modify the recommendations +(or ranking) of an item in the desired manner. A recourse suggests actions of +the form: "if a feature changes from X to Y, then the ranking of that item for +a set of users will change to Z." Furthermore, we demonstrate that RecRec is +highly effective in generating valid, sparse, and actionable recourses through +an empirical evaluation of recommender systems trained on three real-world +datasets. To the best of our knowledge, this work is the first to conceptualize +and empirically test a generalized framework for generating recourses for +recommender systems. + +
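A recourse of the form "change feature X to Y" can be found, in the simplest case, by a brute-force search over candidate feature values. The sketch below assumes a hypothetical trained scorer `score(users, item) -> np.ndarray`; it is an illustration of the recourse concept, not the RecRec algorithm itself.

```python
import numpy as np

def find_recourse(score, users, item: np.ndarray, feature: int, candidates):
    """Search one feature for the edit that most improves the item's mean score."""
    base = score(users, item).mean()
    best = None
    for value in candidates:
        edited = item.copy()
        edited[feature] = value  # the action: "change this feature to value"
        gain = score(users, edited).mean() - base
        if gain > 0 and (best is None or gain > best[1]):
            best = (value, gain)
    return best  # (new value, expected gain) or None if no edit helps
```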
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in + Recommendation Networks + + +
+ Recommendation models are vital in delivering personalized user experiences +by leveraging the correlation between multiple input features. However, deep +learning-based recommendation models often face challenges due to evolving user +behaviour and item features, leading to covariate shifts. Effective +cross-feature learning is crucial for handling data distribution drift and +adapting to changing user behaviour. Traditional feature interaction techniques +have limitations in achieving optimal performance in this context. + This work introduces Ad-Rec, an advanced network that leverages feature +interaction techniques to address covariate shifts. This helps eliminate +irrelevant interactions in recommendation tasks. Ad-Rec leverages masked +transformers to enable the learning of higher-order cross-features while +mitigating the impact of data distribution drift. Our approach improves model +quality, accelerates convergence, and reduces training time, as measured by the +Area Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its +ability to achieve superior model quality through comprehensive ablation +studies. + +
+
+
+
+
+ + ☆ Extending Cross-Modal Retrieval with Interactive Learning to Improve + Image Retrieval Performance in Forensics AAAI22 + + +
+ Nowadays, one of the critical challenges in forensics is analyzing the +enormous amounts of unstructured digital evidence, such as images. Often, +unstructured digital evidence contains precious information for forensic +investigations. Therefore, a retrieval system that can effectively identify +forensically relevant images is paramount. In this work, we explored the +effectiveness of interactive learning in improving image retrieval performance +in the forensic domain by proposing Excalibur - a zero-shot cross-modal image +retrieval system extended with interactive learning. Excalibur was evaluated +using both simulations and a user study. The simulations reveal that +interactive learning is highly effective in improving retrieval performance in +the forensic domain. Furthermore, user study participants could effectively +leverage the power of interactive learning. Finally, they considered Excalibur +effective and straightforward to use and expressed interest in using it in +their daily practice. + +
+
+ comment: Submitted to the AAAI22 conference +
+
+
+
+
+ + ♻ ☆ How Discriminative Are Your Qrels? How To Study the Statistical + Significance of Document Adjudication Methods + + +
+ Creating test collections for offline retrieval evaluation requires human +effort to judge documents' relevance. This expensive activity motivated much +work in developing methods for constructing benchmarks with fewer assessment +costs. In this respect, adjudication methods actively decide both which +documents and the order in which experts review them, in order to better +exploit the assessment budget or to lower it. Researchers evaluate the quality +of those methods by measuring the correlation between the known gold ranking of +systems under the full collection and the observed ranking of systems under the +lower-cost one. This traditional analysis ignores whether and how the low-cost +judgements impact the statistically significant differences among systems +with respect to the full collection. We fill this void by proposing a novel +methodology to evaluate how the low-cost adjudication methods preserve the +pairwise significant differences between systems observed under the full +collection. In other words, while traditional approaches look for stability in +answering the question "is system A better than system B?", our proposed +approach looks for stability in answering the question "is system A +significantly better than system B?", which is the ultimate question +researchers need to answer to guarantee the generalisability of their results. +Among other results, we found that the best methods in terms of system-ranking +correlation do not always match those preserving statistical significance. + +
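The proposed analysis can be sketched directly: for every system pair, compare the significance decision under the full qrels with the one under the low-cost qrels, and report the fraction preserved. The sketch below uses a paired t-test as the significance test; the data layout (`scores[qrels][system]` holding per-topic effectiveness scores) is illustrative.

```python
from itertools import combinations
from scipy.stats import ttest_rel

def significance_agreement(scores, systems, alpha=0.05):
    """Fraction of pairwise significance decisions preserved by low-cost qrels."""
    agree = total = 0
    for a, b in combinations(systems, 2):
        sig_full = ttest_rel(scores["full"][a], scores["full"][b]).pvalue < alpha
        sig_cheap = ttest_rel(scores["cheap"][a], scores["cheap"][b]).pvalue < alpha
        agree += sig_full == sig_cheap
        total += 1
    return agree / total
```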
+
+
+
+
+ + ♻ ☆ Leveraging Watch-time Feedback for Short-Video Recommendations: A Causal + Labeling Framework + + +
+ With the proliferation of short video applications, the significance of short +video recommendations has vastly increased. Unlike other recommendation +scenarios, short video recommendation systems heavily rely on feedback from +watch time. Existing approaches simply treat watch time as a direct label, +failing to effectively harness its extensive semantics and introducing bias, +thereby limiting the potential for modeling user interests based on watch time. +To overcome this challenge, we propose a framework named Debiased +Multiple-semantics-extracting Labeling (DML). DML constructs labels that +encompass various semantics by utilizing quantiles derived from the +distribution of watch time, prioritizing relative order rather than absolute +label values. This approach facilitates easier model learning while aligning +with the ranking objective of recommendations. Furthermore, we introduce a +method inspired by causal adjustment to refine label definitions, thereby +directly mitigating bias at the label level. We substantiate the effectiveness +of our DML framework through both online and offline experiments. Extensive +results demonstrate that our DML can effectively leverage watch time to +discover users' real interests, enhancing their engagement in our application. + +
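The quantile-labeling idea lends itself to a short sketch. Below, raw watch time is replaced by its quantile rank; conditioning the quantiles on duration buckets is one illustrative way to combine this with debiasing for video length, and may differ from the paper's exact construction.

```python
import numpy as np

def quantile_labels(watch_time, duration, n_buckets=10, n_quantiles=5):
    """Map raw watch time to quantile labels within duration buckets."""
    watch_time, duration = np.asarray(watch_time), np.asarray(duration)
    labels = np.zeros_like(watch_time, dtype=int)
    bucket_edges = np.quantile(duration, np.linspace(0, 1, n_buckets + 1)[1:-1])
    buckets = np.digitize(duration, bucket_edges)
    for b in np.unique(buckets):
        idx = buckets == b
        edges = np.quantile(watch_time[idx], np.linspace(0, 1, n_quantiles + 1)[1:-1])
        labels[idx] = np.digitize(watch_time[idx], edges)  # 0 .. n_quantiles-1
    return labels
```

The labels encode relative order only, which matches the abstract's point that ranking, not absolute watch time, is the recommendation objective.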
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+
+
+
+ + Machine Learning 145 + +
+
+
+ + ☆ Efficient Discovery and Effective Evaluation of Visual Perceptual + Similarity: A Benchmark and Beyond ICCV 2023 + + +
+ Visual similarities discovery (VSD) is an important task with broad +e-commerce applications. Given an image of a certain object, the goal of VSD is +to retrieve images of different objects with high perceptual visual similarity. +Although it is a highly addressed problem, the evaluation of proposed methods +for VSD is often based on an identification-retrieval task used as a proxy, +evaluating the ability of a model to retrieve different images of the same +object. We posit that evaluating VSD methods based on identification tasks is +limited, and faithful evaluation must rely on expert annotations. In this +paper, we introduce the first large-scale fashion visual similarity benchmark +dataset, consisting of more than 110K expert-annotated image pairs. Besides +this major contribution, we share insights from the challenges we faced while +curating this dataset. Based on these insights, we propose a novel and +efficient labeling procedure that can be applied to any dataset. Our analysis +examines its limitations and inductive biases, and based on these findings, we +propose metrics to mitigate those limitations. Though our primary focus lies on +visual similarity, the methodologies we present have broader applications for +discovering and evaluating perceptual similarity across various domains. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Minimizing Quasi-Self-Concordant Functions by Gradient Regularization of + Newton Method + + +
+ We study the composite convex optimization problems with a +Quasi-Self-Concordant smooth component. This problem class naturally +interpolates between classic Self-Concordant functions and functions with +Lipschitz continuous Hessian. Previously, the best complexity bounds for this +problem class were associated with trust-region schemes and implementations of +a ball-minimization oracle. In this paper, we show that for minimizing +Quasi-Self-Concordant functions we can use instead the basic Newton Method with +Gradient Regularization. For unconstrained minimization, it only involves a +simple matrix inversion operation (solving a linear system) at each step. We +prove a fast global linear rate for this algorithm, matching the complexity +bound of the trust-region scheme, while our method remains especially simple to +implement. Then, we introduce the Dual Newton Method, and based on it, develop +the corresponding Accelerated Newton Scheme for this problem class, which +further improves the complexity factor of the basic method. As a direct +consequence of our results, we establish fast global linear rates of simple +variants of the Newton Method applied to several practical problems, including +Logistic Regression, Soft Maximum, and Matrix Scaling, without requiring +additional assumptions on strong or uniform convexity for the target objective. + +
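The basic step analyzed above is simple to state. Here is a minimal numpy sketch of Newton's method with gradient regularization on a toy regularized logistic loss; the damping coefficient `M` is an illustrative placeholder (the paper derives the exact choice from the quasi-self-concordance parameter), so this is a sketch rather than the authors' algorithm.

```python
import numpy as np

# Toy objective: f(x) = sum_i log(1 + exp(-a_i @ x)) + (mu/2) ||x||^2
A = np.array([[1.0, -2.0], [-1.5, 0.5]])
mu = 0.1

def grad(x):
    s = 1.0 / (1.0 + np.exp(A @ x))             # sigmoid(-a_i @ x)
    return -A.T @ s + mu * x

def hess(x):
    s = 1.0 / (1.0 + np.exp(A @ x))
    return A.T @ np.diag(s * (1.0 - s)) @ A + mu * np.eye(2)

def newton_gr_step(x, M=1.0):
    g, H = grad(x), hess(x)
    lam = M * np.linalg.norm(g)                  # damping grows with the gradient norm
    return x - np.linalg.solve(H + lam * np.eye(x.size), g)  # one linear system per step

x = np.zeros(2)
for _ in range(20):
    x = newton_gr_step(x)
print(x, np.linalg.norm(grad(x)))                # gradient norm shrinks rapidly
```

Each step is indeed just a regularized linear solve, which is the simplicity the abstract emphasizes over trust-region or ball-minimization oracles.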
+
+
+
+
+ + ☆ Total Selfie: Generating Full-Body Selfies + + +
+ We present a method to generate full-body selfies -- photos that you take of +yourself, but capturing your whole body as if someone else took the photo of +you from a few feet away. Our approach takes as input a pre-captured video of +your body, a target pose photo, and a selfie + background pair for each +location. We introduce a novel diffusion-based approach to combine all of this +information into high quality, well-composed photos of you with the desired +pose and background. + +
+
+ comment: Project page: + https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/ +
+
+
+
+
+ + ☆ Fast Feedforward Networks + + +
+ We break the linear link between the layer size and its inference cost by +introducing the fast feedforward (FFF) architecture, a logarithmic-time +alternative to feedforward networks. + We show that FFFs give comparable performance to feedforward networks at an +exponential fraction of their inference cost, are quicker to deliver +performance compared to mixture-of-expert networks, and can readily take the +place of either in transformers. + Pushing FFFs to the absolute limit, we train a vision transformer to perform +single-neuron inferences at the cost of only 5.8% performance decrease against +the full-width variant. + Our implementation is available as a Python package; just use "pip install +fastfeedforward". + +
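The logarithmic-time claim comes from routing each input down a small decision tree so that only one leaf block executes. The toy module below illustrates that routing idea with hard decisions at inference time; it is a conceptual sketch only, and the released `fastfeedforward` package is the reference implementation (training details such as soft routing are omitted here).

```python
import torch
import torch.nn as nn

class TinyFFF(nn.Module):
    """Route each input through one of 2**depth leaf MLPs (batch size 1, hard routing)."""
    def __init__(self, dim: int, leaf_width: int, depth: int):
        super().__init__()
        self.depth = depth
        self.routers = nn.ModuleList([nn.Linear(dim, 1) for _ in range(2 ** depth - 1)])
        self.leaves = nn.ModuleList(
            [nn.Sequential(nn.Linear(dim, leaf_width), nn.ReLU(), nn.Linear(leaf_width, dim))
             for _ in range(2 ** depth)]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (dim,)
        node = 0
        for _ in range(self.depth):  # logarithmic-time descent through the tree
            go_right = (self.routers[node](x) > 0).item()
            node = 2 * node + 1 + int(go_right)
        return self.leaves[node - (2 ** self.depth - 1)](x)

y = TinyFFF(dim=16, leaf_width=32, depth=3)(torch.randn(16))  # only 1 of 8 leaves runs
```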
+
+ comment: 12 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ VideoCutLER: Surprisingly Simple Unsupervised Video Instance + Segmentation + + +
+ Existing approaches to unsupervised video instance segmentation typically +rely on motion estimates and experience difficulties tracking small or +divergent motions. We present VideoCutLER, a simple method for unsupervised +multi-instance video segmentation without using motion-based learning signals +like optical flow or training on natural videos. Our key insight is that using +high-quality pseudo masks and a simple video synthesis method for model +training is surprisingly sufficient to enable the resulting video model to +effectively segment and track multiple instances across video frames. We show +the first competitive unsupervised learning results on the challenging +YouTubeVIS-2019 benchmark, achieving 50.7% APvideo^50 , surpassing the previous +state-of-the-art by a large margin. VideoCutLER can also serve as a strong +pretrained model for supervised video instance segmentation tasks, exceeding +DINO by 15.9% on YouTubeVIS-2019 in terms of APvideo. + +
+
+ comment: Preprint. Code: https://github.com/facebookresearch/CutLER +
+
+
+
+
+ + ☆ Diversified Ensemble of Independent Sub-Networks for Robust + Self-Supervised Representation Learning + + +
+ Ensembling a neural network is a widely recognized approach to enhance model +performance, estimate uncertainty, and improve robustness in deep supervised +learning. However, deep ensembles often come with high computational costs and +memory demands. In addition, the efficiency of a deep ensemble is related to +diversity among the ensemble members which is challenging for large, +over-parameterized deep neural networks. Moreover, ensemble learning has not +yet seen such widespread adoption, and it remains a challenging endeavor for +self-supervised or unsupervised representation learning. Motivated by these +challenges, we present a novel self-supervised training regime that leverages +an ensemble of independent sub-networks, complemented by a new loss function +designed to encourage diversity. Our method efficiently builds a sub-model +ensemble with high diversity, leading to well-calibrated estimates of model +uncertainty, all achieved with minimal computational overhead compared to +traditional deep self-supervised ensembles. To evaluate the effectiveness of +our approach, we conducted extensive experiments across various tasks, +including in-distribution generalization, out-of-distribution detection, +dataset corruption, and semi-supervised settings. The results demonstrate that +our method significantly improves prediction reliability. Our approach not only +achieves excellent accuracy but also enhances calibration, surpassing baseline +performance across a wide range of self-supervised architectures in computer +vision, natural language processing, and genomics data. + +
+
+
+
+
+ + ☆ Hybrid PLS-ML Authentication Scheme for V2I Communication Networks + + +
+ Vehicular communication networks are rapidly emerging as vehicles become +smarter. However, these networks are increasingly susceptible to various +attacks. The situation is exacerbated by the rise of automated vehicles, +emphasizing the need for security and authentication measures to +ensure safe and effective traffic management. In this paper, we propose a novel +hybrid physical layer security (PLS)-machine learning (ML) authentication +scheme by exploiting the position of the transmitter vehicle as a device +fingerprint. We use a time-of-arrival (ToA) based localization mechanism where +the ToA is estimated at roadside units (RSUs), and the coordinates of the +transmitter vehicle are extracted at the base station (BS). Furthermore, to +track the mobility of the moving legitimate vehicle, we use an ML model trained +on several system parameters. We try two ML models for this purpose, namely +support vector regression and decision trees. To evaluate our scheme, we +conduct binary hypothesis testing on the estimated positions with the help of +the ground truths provided by the ML model, which classifies the transmitter +node as legitimate or malicious. Moreover, we consider the probability of false +alarm and the probability of missed detection as performance metrics resulting +from the binary hypothesis testing, and mean absolute error (MAE), mean square +error (MSE), and coefficient of determination $\text{R}^2$ to further evaluate +the ML models. We also compare our scheme with a baseline scheme that exploits +the angle of arrival at RSUs for authentication. We observe that our proposed +position-based mechanism outperforms the baseline scheme significantly in terms +of missed detections. + +
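The binary hypothesis test at the core of the scheme reduces to thresholding the distance between the ToA-localized position and the position the ML model predicts for the legitimate vehicle. A hedged sketch, with the threshold and interfaces as placeholders:

```python
import numpy as np

def authenticate(est_pos, pred_pos, threshold: float) -> bool:
    """H0 (legitimate) if the localization residual is within the threshold."""
    residual = np.linalg.norm(np.asarray(est_pos) - np.asarray(pred_pos))
    return residual <= threshold  # False means: flag the transmitter as malicious
```

Sweeping `threshold` trades the probability of false alarm against the probability of missed detection, the two metrics the abstract reports.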
+
+ comment: Accepted for Publication following Presentation at IEEE ISNCC-23 +
+
+
+
+
+ + ☆ Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual + Predatory Chats and Abusive Texts + + +
+ Detecting online sexual predatory behaviours and abusive language on social +media platforms has become a critical area of research due to the growing +concerns about online safety, especially for vulnerable populations such as +children and adolescents. Researchers have been exploring various techniques +and approaches to develop effective detection systems that can identify and +mitigate these risks. Recent development of large language models (LLMs) has +opened a new opportunity to address this problem more effectively. This paper +proposes an approach to detection of online sexual predatory chats and abusive +language using the open-source pretrained Llama 2 7B-parameter model, recently +released by Meta GenAI. We fine-tune the LLM using datasets with different +sizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu). +Based on the power of LLMs, our approach is generic and automated without a +manual search for a synergy between feature extraction and classifier design +steps like conventional methods in this domain. Experimental results show a +strong performance of the proposed approach, which performs proficiently and +consistently across three distinct datasets with five sets of experiments. This +study's outcomes indicate that the proposed method can be implemented in +real-world applications (even with non-English languages) for flagging sexual +predators, offensive or toxic content, hate speech, and discriminatory language +in online discussions and comments to maintain respectful internet or digital +communities. Furthermore, it can be employed for solving text classification +problems with other potential applications such as sentiment analysis, spam and +phishing detection, sorting legal documents, fake news detection, language +identification, user intent recognition, text-based product categorization, +medical record analysis, and resume screening. + +
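One plausible parameter-efficient setup for adapting Llama 2 7B to such a binary chat-classification task uses LoRA via the `peft` library; the paper's exact training recipe is not reproduced here, and the model name and hyperparameters below are placeholders.

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_name = "meta-llama/Llama-2-7b-hf"  # assumed checkpoint; requires access approval
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id
model = get_peft_model(model, LoraConfig(task_type="SEQ_CLS", r=8,
                                         lora_alpha=16, lora_dropout=0.05))
model.print_trainable_parameters()  # only the small adapter matrices are trained
```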
+
+
+
+
+ + ☆ RESTORE: Graph Embedding Assessment Through Reconstruction + + +
+ Following the success of Word2Vec embeddings, graph embeddings (GEs) have +gained substantial traction. GEs are commonly generated and evaluated +extrinsically on downstream applications, but intrinsic evaluations of the +original graph properties in terms of topological structure and semantic +information have been lacking. Understanding these will help identify the +deficiencies of the various families of GE methods when vectorizing graphs in +terms of preserving the relevant knowledge or learning incorrect knowledge. To +address this, we propose RESTORE, a framework for intrinsic GE assessment +through graph reconstruction. We show that reconstructing the original graph +from the underlying GEs yields insights into the relative amount of information +preserved in a given vector form. We first introduce the graph reconstruction +task. We generate GEs from three GE families based on factorization methods, +random walks, and deep learning (with representative algorithms from each +family) on the CommonSense Knowledge Graph (CSKG). We analyze their +effectiveness in preserving the (a) topological structure of node-level graph +reconstruction with an increasing number of hops and (b) semantic information +on various word semantic and analogy tests. Our evaluations show that the deep +learning-based GE algorithm (SDNE) is overall better at preserving (a), with a +mean average precision (mAP) of 0.54 and 0.35 for 2- and 3-hop reconstruction +respectively, while the factorization-based algorithm (HOPE) is better at +encapsulating (b), with an average Euclidean distance of 0.14, 0.17, and 0.11 +for 1-, 2-, and 3-hop reconstruction respectively. The modest performance of +these GEs leaves room for further research avenues on better graph +representation learning. + +
+
+
+
+
+ + ☆ Adversarial Predictions of Data Distributions Across Federated + Internet-of-Things Devices + + +
+ Federated learning (FL) is increasingly becoming the default approach for +training machine learning models across decentralized Internet-of-Things (IoT) +devices. A key advantage of FL is that no raw data are communicated across the +network, providing an immediate layer of privacy. Despite this, recent works +have demonstrated that data reconstruction can be done with the locally trained +model updates which are communicated across the network. However, many of these +works have limitations with regard to how the gradients are computed in +backpropagation. In this work, we demonstrate that the model weights shared in +FL can expose revealing information about the local data distributions of IoT +devices. This leakage could expose sensitive information to malicious actors in +a distributed system. We further discuss results which show that injecting +noise into model weights is ineffective at preventing data leakage without +seriously harming the global model accuracy. + +
+
+ comment: 6 pages, 6 figures, accepted for publication through 2023 IEEE World + Forum on Internet of Things +
+
+
+
+
+ + ☆ Comparison of automated crater catalogs for Mars from Benedix et al. + (2020) and Lee and Hogan (2021) + + +
+ Crater mapping using neural networks and other automated methods has +increased recently with automated Crater Detection Algorithms (CDAs) applied to +planetary bodies throughout the solar system. A recent publication by Benedix +et al. (2020) showed high performance at small scales compared to similar +automated CDAs but with a net positive diameter bias in many crater candidates. +I compare the publicly available catalogs from Benedix et al. (2020) and Lee & +Hogan (2021) and show that the reported performance is sensitive to the metrics +used to test the catalogs. I show how the more permissive comparison methods +indicate a higher CDA performance by allowing worse candidate craters to match +ground-truth craters. I show that the Benedix et al. (2020) catalog has a +substantial performance loss with increasing latitude and identify an image +projection issue that might cause this loss. Finally, I suggest future +applications of neural networks in generating large scientific datasets be +validated using secondary networks with independent data sources or training +methods. + +
+
+ comment: 14 pages, 6 figures. Accepted August 13th 2023 +
+
+
+
+
+ + ☆ Edge Generation Scheduling for DAG Tasks using Deep Reinforcement + Learning + + +
+ Directed acyclic graph (DAG) tasks are currently adopted in the real-time +domain to model complex applications from the automotive, avionics, and +industrial domains that implement their functionalities through chains of +intercommunicating tasks. This paper studies the problem of scheduling +real-time DAG tasks by presenting a novel schedulability test based on the +concept of trivial schedulability. Using this schedulability test, we propose a +new DAG scheduling framework (edge generation scheduling -- EGS) that attempts +to minimize the DAG width by iteratively generating edges while guaranteeing +the deadline constraint. We study how to efficiently solve the problem of +generating edges by developing a deep reinforcement learning algorithm combined +with a graph representation neural network to learn an efficient edge +generation policy for EGS. We evaluate the effectiveness of the proposed +algorithm by comparing it with state-of-the-art DAG scheduling heuristics and +an optimal mixed-integer linear programming baseline. Experimental results show +that the proposed algorithm outperforms the state-of-the-art by requiring fewer +processors to schedule the same DAG tasks. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Human Comfortability Index Estimation in Industrial Human-Robot + Collaboration Task + + +
+ Fluent human-robot collaboration requires a robot teammate to understand, +learn, and adapt to the human's psycho-physiological state. Such collaborations +require a computing system that monitors human physiological signals during +human-robot collaboration (HRC) to quantitatively estimate a human's level of +comfort, which in this research we have termed the comfortability index (CI) +and uncomfortability index (unCI). Subjective metrics (surprise, anxiety, +boredom, calmness, and comfortability) and physiological signals were collected +during a human-robot collaboration experiment that varied robot behavior. The +emotion circumplex model is adapted to calculate the CI from the participant's +quantitative data as well as physiological data. To estimate CI/unCI from +physiological signals, time features were extracted from electrocardiogram +(ECG), galvanic skin response (GSR), and pupillometry signals. In this +research, we successfully adapt the circumplex model to find the locations +(axes) of 'comfortability' and 'uncomfortability' on the circumplex model, and +show that these locations match the closest emotions on the model. Finally, +the study showed that the proposed approach can estimate human +comfortability/uncomfortability from physiological signals. + +
+
+ comment: Submitted to IEEE-THMS +
+
+
+
+
+ + ☆ Rate-Optimal Policy Optimization for Linear Markov Decision Processes + + +
+ We study regret minimization in online episodic linear Markov Decision +Processes, and obtain rate-optimal $\widetilde O (\sqrt K)$ regret where $K$ +denotes the number of episodes. Our work is the first to establish the optimal +(w.r.t.~$K$) rate of convergence in the stochastic setting with bandit feedback +using a policy optimization based approach, and the first to establish the +optimal (w.r.t.~$K$) rate in the adversarial setup with full information +feedback, for which no algorithm with an optimal rate guarantee is currently +known. + +
+
+
+
+
+ + ☆ Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance IJCAI-2023 + + +
+ We propose the use of conversational GPT models for easy and quick few-shot +text classification in the financial domain using the Banking77 dataset. Our +approach involves in-context learning with GPT-3.5 and GPT-4, which minimizes +the technical expertise required and eliminates the need for expensive GPU +computing while yielding quick and accurate results. Additionally, we fine-tune +other pre-trained, masked language models with SetFit, a recent contrastive +learning technique, to achieve state-of-the-art results both in full-data and +few-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can +outperform fine-tuned, non-generative models even with fewer examples. However, +subscription fees associated with these solutions may be considered costly for +small organizations. Lastly, we find that generative models perform better on +the given task when shown representative samples selected by a human expert +rather than when shown random ones. We conclude that a) our proposed methods +offer a practical solution for few-shot tasks in datasets with limited label +availability, and b) our state-of-the-art results can inspire future work in +the area. + +
+
+ comment: Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023 +
+
+
+
+
+ + ☆ Comparing AutoML and Deep Learning Methods for Condition Monitoring + using Realistic Validation Scenarios + + +
+ This study extensively compares conventional machine learning methods and +deep learning for condition monitoring tasks using an AutoML toolbox. The +experiments reveal consistently high accuracy in random K-fold cross-validation +scenarios across all tested models. However, when employing leave-one-group-out +(LOGO) cross-validation on the same datasets, no clear winner emerges, +indicating the presence of domain shift in real-world scenarios. Additionally, +the study assesses the scalability and interpretability of conventional methods +and neural networks. Conventional methods offer explainability, with their +modular structure aiding feature identification. In contrast, neural networks +require specialized interpretation techniques like occlusion maps to visualize +important regions in the input data. Finally, the paper highlights the +significance of feature selection, particularly in condition monitoring tasks +with limited class variations. Low-complexity models prove sufficient for such +tasks, as only a few features from the input signal are typically needed. In +summary, these findings offer crucial insights into the strengths and +limitations of various approaches, providing valuable benchmarks and +identifying the most suitable methods for condition monitoring applications, +thereby enhancing their applicability in real-world scenarios. + +
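The LOGO protocol that exposes the domain shift is straightforward to reproduce with scikit-learn: each group (e.g., one machine or recording session) is held out in turn, so the model is always tested on an unseen domain. The data below is synthetic, standing in for a condition-monitoring dataset.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

rng = np.random.default_rng(0)
X, y = rng.normal(size=(200, 8)), rng.integers(0, 2, size=200)
groups = np.repeat(np.arange(5), 40)  # five hypothetical recording sessions

scores = cross_val_score(RandomForestClassifier(random_state=0), X, y,
                         cv=LeaveOneGroupOut(), groups=groups)
print(scores.mean())  # on real data, often far below random K-fold accuracy
```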
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ VesselShot: Few-shot learning for cerebral blood vessel segmentation + + +
+ Angiography is widely used to detect, diagnose, and treat cerebrovascular +diseases. While numerous techniques have been proposed to segment the vascular +network from different imaging modalities, deep learning (DL) has emerged as a +promising approach. However, existing DL methods often depend on proprietary +datasets and extensive manual annotation. Moreover, the availability of +pre-trained networks specifically for medical domains and 3D volumes is +limited. To overcome these challenges, we propose a few-shot learning approach +called VesselShot for cerebrovascular segmentation. VesselShot leverages +knowledge from a few annotated support images and mitigates the scarcity of +labeled data and the need for extensive annotation in cerebral blood vessel +segmentation. We evaluated the performance of VesselShot using the publicly +available TubeTK dataset for the segmentation task, achieving a mean Dice +coefficient (DC) of 0.62(0.03). + +
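The reported metric is the standard Dice coefficient between a predicted and a ground-truth binary vessel mask; the definition below is the conventional one, not code from the paper.

```python
import numpy as np

def dice(pred: np.ndarray, truth: np.ndarray, eps: float = 1e-8) -> float:
    """Dice coefficient: 2|P ∩ T| / (|P| + |T|) for binary masks."""
    pred, truth = pred.astype(bool), truth.astype(bool)
    return 2.0 * np.logical_and(pred, truth).sum() / (pred.sum() + truth.sum() + eps)
```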
+
+
+
+
+ + ☆ AI in the Gray: Exploring Moderation Policies in Dialogic Large Language + Models vs. Human Answers in Controversial Topics + + +
+ The introduction of ChatGPT and the subsequent improvement of Large Language +Models (LLMs) have prompted more and more individuals to turn to the use of +ChatBots, both for information and assistance with decision-making. However, +the information the user is after often cannot be formulated by these ChatBots +objectively enough to provide a definite, globally accepted answer. + Controversial topics, such as "religion", "gender identity", "freedom of +speech", and "equality", among others, can be a source of conflict as partisan +or biased answers can reinforce preconceived notions or promote disinformation. +By exposing ChatGPT to such debatable questions, we aim to understand its level +of awareness and whether existing models are subject to socio-political and/or +economic biases. We also aim to explore how AI-generated answers compare to +human ones. To explore this, we use a dataset from a social media platform +created for the purpose of debating human-generated claims on polemic subjects +among users, dubbed Kialo. + Our results show that while previous versions of ChatGPT have had important +issues with controversial topics, more recent versions of ChatGPT +(gpt-3.5-turbo) are no longer manifesting significant explicit biases in +several knowledge areas. In particular, it is well-moderated regarding economic +aspects. However, it still maintains a degree of implicit libertarian leaning +toward right-wing ideals, which suggests the need for increased moderation from +the socio-political point of view. In terms of domain knowledge on +controversial topics, with the exception of the "Philosophical" category, +ChatGPT is performing well in keeping up with the collective human level of +knowledge. Finally, we see that the sources of Bing AI tend slightly more +toward the center when compared to human answers. All the analyses we make are +generalizable to other types of biases and domains. + +
+
+
+
+
+ + ☆ On the Tradeoff between Privacy Preservation and Byzantine-Robustness in + Decentralized Learning + + +
+ This paper jointly considers privacy preservation and Byzantine-robustness in +decentralized learning. In a decentralized network, honest-but-curious agents +faithfully follow the prescribed algorithm, but expect to infer their +neighbors' private data from messages received during the learning process, +while dishonest-and-Byzantine agents disobey the prescribed algorithm, and +deliberately disseminate wrong messages to their neighbors so as to bias the +learning process. For this novel setting, we investigate a generic +privacy-preserving and Byzantine-robust decentralized stochastic gradient +descent (SGD) framework, in which Gaussian noise is injected to preserve +privacy and robust aggregation rules are adopted to counteract Byzantine +attacks. We analyze its learning error and privacy guarantee, discovering an +essential tradeoff between privacy preservation and Byzantine-robustness in +decentralized learning -- the learning error caused by defending against +Byzantine attacks is exacerbated by the Gaussian noise added to preserve +privacy. Numerical experiments are conducted and corroborate our theoretical +findings. + +
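One communication round of the analyzed framework can be sketched in a few lines: each agent perturbs its outgoing model with Gaussian noise for privacy, and receivers combine neighbor models with a robust rule (coordinate-wise median here, one of several options) instead of plain averaging. Names are illustrative.

```python
import numpy as np

def private_message(model: np.ndarray, sigma: float, rng) -> np.ndarray:
    """Gaussian perturbation of the outgoing model (privacy preservation)."""
    return model + rng.normal(0.0, sigma, size=model.shape)

def robust_aggregate(neighbor_models: list[np.ndarray]) -> np.ndarray:
    """Coordinate-wise median, a Byzantine-robust alternative to the mean."""
    return np.median(np.stack(neighbor_models), axis=0)
```

The tradeoff the paper identifies is visible here: a larger `sigma` strengthens privacy but inflates the error that the robust aggregation rule must absorb.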
+
+
+
+
+ + ☆ Recent Progress in Energy Management of Connected Hybrid Electric + Vehicles Using Reinforcement Learning + + +
+ The growing adoption of hybrid electric vehicles (HEVs) presents a +transformative opportunity for revolutionizing transportation energy systems. +The shift towards electrifying transportation aims to curb environmental +concerns related to fossil fuel consumption. This necessitates efficient energy +management systems (EMS) to optimize energy efficiency. The evolution of EMS +from HEVs to connected hybrid electric vehicles (CHEVs) represents a pivotal +shift. Beyond single HEVs, EMS now confronts the intricate energy cooperation +requirements of CHEVs, necessitating advanced algorithms for route +optimization, charging coordination, and load distribution. Challenges persist +in both domains, including optimal energy utilization for HEVs and cooperative +eco-driving control (CED) for CHEVs across diverse vehicle types. Reinforcement +learning (RL) stands out as a promising tool for addressing the challenges at +hand. Specifically, within the realm of CHEVs, the application of multi-agent +reinforcement learning (MARL) emerges as a powerful approach for effectively +tackling the intricacies of CED control. Despite extensive research, few +reviews span from individual vehicles to multi-vehicle scenarios. This review +bridges the gap, highlighting challenges, advancements, and potential +contributions of RL-based solutions for future sustainable transportation +systems. + +
+
+
+
+
+ + ☆ Fairness Through Domain Awareness: Mitigating Popularity Bias For Music + Discovery + + +
+ As online music platforms grow, music recommender systems play a vital role +in helping users navigate and discover content within their vast musical +databases. At odds with this larger goal is the presence of popularity bias, +which causes algorithmic systems to favor mainstream content over potentially +more relevant but niche items. In this work we explore the intrinsic +relationship between music discovery and popularity bias. To mitigate this +issue we propose a domain-aware, individual fairness-based approach which +addresses popularity bias in graph neural network (GNN)-based recommender +systems. Our approach uses individual fairness to reflect a ground truth +listening experience, i.e., if two songs sound similar, this similarity should +be reflected in their representations. In doing so, we facilitate meaningful +music discovery that is robust to popularity bias and grounded in the music +domain. We apply our BOOST methodology to two discovery based tasks, performing +recommendations at both the playlist level and user level. Then, we ground our +evaluation in the cold start setting, showing that our approach outperforms +existing fairness benchmarks in both performance and recommendation of +lesser-known content. Finally, our analysis explains why our proposed +methodology is a novel and promising approach to mitigating popularity bias and +improving the discovery of new and niche content in music recommender systems. + +
+
+
+
+
+ + ☆ Adversarial Attacks on Foundational Vision Models + + +
+ Rapid progress is being made in developing large, pretrained, task-agnostic +foundational vision models such as CLIP, ALIGN, DINOv2, etc. In fact, we are +approaching the point where these models do not have to be finetuned +downstream, and can simply be used in zero-shot or with a lightweight probing +head. Critically, given the complexity of working at this scale, there is a +bottleneck where relatively few organizations in the world are executing the +training then sharing the models on centralized platforms such as HuggingFace +and torch.hub. The goal of this work is to identify several key adversarial +vulnerabilities of these models in an effort to make future designs more +robust. Intuitively, our attacks manipulate deep feature representations to +fool an out-of-distribution (OOD) detector which will be required when using +these open-world-aware models to solve closed-set downstream tasks. Our methods +reliably make in-distribution (ID) images (w.r.t. a downstream task) be +predicted as OOD and vice versa while existing in extremely +low-knowledge-assumption threat models. We show our attacks to be potent in +whitebox and blackbox settings, as well as when transferred across foundational +model types (e.g., attack DINOv2 with CLIP)! This work is only just the +beginning of a long journey towards adversarially robust foundational vision +models. + +
+
+
+
+
+ + ☆ LatentDR: Improving Model Generalization Through Sample-Aware Latent + Degradation and Restoration + + +
+ Despite significant advances in deep learning, models often struggle to +generalize well to new, unseen domains, especially when training data is +limited. To address this challenge, we propose a novel approach for +distribution-aware latent augmentation that leverages the relationships across +samples to guide the augmentation procedure. Our approach first degrades the +samples stochastically in the latent space, mapping them to augmented labels, +and then restores the samples from their corrupted versions during training. +This process confuses the classifier in the degradation step and restores the +overall class distribution of the original samples, promoting diverse +intra-class/cross-domain variability. We extensively evaluate our approach on a +diverse set of datasets and tasks, including domain generalization benchmarks +and medical imaging datasets with strong domain shift, where we show our +approach achieves significant improvements over existing methods for latent +space augmentation. We further show that our method can be flexibly adapted to +long-tail recognition tasks, demonstrating its versatility in building more +generalizable models. Code is available at +https://github.com/nerdslab/LatentDR. + +
+
+
+
+
+ + ☆ Neural Network Training Strategy to Enhance Anomaly Detection + Performance: A Perspective on Reconstruction Loss Amplification + + +
+ Unsupervised anomaly detection (UAD) is a widely adopted approach in industry +due to rare anomaly occurrences and data imbalance. A desirable characteristic +of a UAD model is a contained generalization ability, which excels at +reconstructing seen normal patterns but struggles with unseen anomalies. +Recent studies have sought to contain the generalization capability of their +UAD models in reconstruction from different perspectives, such as the design of +the neural network (NN) structure and the training strategy. In contrast, we +note that containment of the generalization ability in reconstruction can also +be obtained simply from a steep-shaped loss landscape. Motivated by this, we +propose a loss landscape sharpening method by amplifying the reconstruction +loss, dubbed Loss AMPlification (LAMP). LAMP deforms the loss landscape into a +steep shape so the reconstruction error on unseen anomalies becomes greater. +Accordingly, the anomaly detection performance is improved without any change +to the NN architecture. Our findings suggest that LAMP can be easily applied to +any reconstruction error metric in UAD settings where the reconstruction model +is trained with anomaly-free samples only. + +
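The sharpening idea can be illustrated by reshaping a standard reconstruction loss so that its landscape is steeper near the minimum. The paper's exact amplification function is not reproduced here; applying a power below one is one simple illustrative choice, since its gradient grows as the error shrinks.

```python
import torch

def amplified_mse(recon: torch.Tensor, target: torch.Tensor, p: float = 0.5):
    """MSE raised to a power p < 1: steeper landscape around zero error."""
    err = torch.mean((recon - target) ** 2)
    return (err + 1e-12) ** p  # epsilon keeps the gradient finite at err = 0
```

As the abstract notes, such a wrapper leaves the network architecture untouched and can sit on top of any reconstruction error metric.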
+
+ comment: 5 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Kernel Limit of Recurrent Neural Networks Trained on Ergodic Data + Sequences + + +
+ Mathematical methods are developed to characterize the asymptotics of +recurrent neural networks (RNN) as the number of hidden units, data samples in +the sequence, hidden state updates, and training steps simultaneously grow to +infinity. In the case of an RNN with a simplified weight matrix, we prove the +convergence of the RNN to the solution of an infinite-dimensional ODE coupled +with the fixed point of a random algebraic equation. The analysis requires +addressing several challenges which are unique to RNNs. In typical mean-field +applications (e.g., feedforward neural networks), discrete updates are of +magnitude $\mathcal{O}(\frac{1}{N})$ and the number of updates is +$\mathcal{O}(N)$. Therefore, the system can be represented as an Euler +approximation of an appropriate ODE/PDE, which it will converge to as $N +\rightarrow \infty$. However, the RNN hidden layer updates are +$\mathcal{O}(1)$. Therefore, RNNs cannot be represented as a discretization of +an ODE/PDE and standard mean-field techniques cannot be applied. Instead, we +develop a fixed point analysis for the evolution of the RNN memory states, with +convergence estimates in terms of the number of update steps and the number of +hidden units. The RNN hidden layer is studied as a function in a Sobolev space, +whose evolution is governed by the data sequence (a Markov chain), the +parameter updates, and its dependence on the RNN hidden layer at the previous +time step. Due to the strong correlation between updates, a Poisson equation +must be used to bound the fluctuations of the RNN around its limit equation. +These mathematical methods give rise to the neural tangent kernel (NTK) limits +for RNNs trained on data sequences as the number of data samples and size of +the neural network grow to infinity. + +
+
+
+
+
+ + ☆ Spoken Language Intelligence of Large Language Models for Language + Learning + + +
+ People have long hoped for a conversational system that can assist in +real-life situations, and recent progress on large language models (LLMs) is +bringing this idea closer to reality. While LLMs are often impressive in +performance, their efficacy in real-world scenarios that demand expert +knowledge remains unclear. LLMs are believed to hold the most potential and +value in education, especially in the development of artificial intelligence +(AI)-based virtual teachers capable of facilitating language learning. Our +focus is centered on evaluating the efficacy of LLMs in the realm of education, +specifically in the areas of spoken language learning, which encompass +phonetics, phonology, and second language acquisition. We introduce a new +multiple-choice question dataset to evaluate the effectiveness of LLMs in the +aforementioned scenarios, including understanding and application of spoken +language knowledge. In addition, we investigate the influence of various +prompting techniques such as zero- and few-shot methods (prepending the +question with question-answer exemplars), chain-of-thought (CoT, think +step-by-step), in-domain exemplars, and external tools (Google, Wikipedia). We +conducted a large-scale evaluation of popular LLMs (20 distinct models) using +these methods. We achieved significant performance improvements compared to the +zero-shot baseline on practical reasoning questions (GPT-3.5, 49.1% -> +63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of different +sizes have a good understanding of concepts in phonetics, phonology, and second +language acquisition, but show limitations in reasoning for real-world +problems. Additionally, we explore preliminary findings on conversational +communication. + +
+
+ comment: 28 pages, 7 figures, Preprint +
+
+
+
+
+ + ☆ Large Graph Models: A Perspective + + +
+ Large models have emerged as the most recent groundbreaking achievements in +artificial intelligence, and particularly machine learning. However, when it +comes to graphs, large models have not achieved the same level of success as in +other fields, such as natural language processing and computer vision. To +advance the application of large models to graphs, we present a perspective +paper to discuss the challenges and opportunities associated with developing +large graph models. First, we discuss the desired characteristics of large +graph models. Then, we present detailed discussions from three key +perspectives: representation basis, graph data, and graph models. In each +category, we provide a brief overview of recent advances and highlight the +remaining challenges together with our visions. Finally, we discuss valuable +applications of large graph models. We believe this perspective paper can +encourage further investigation into large graph models, ultimately pushing +us one step closer to artificial general intelligence (AGI). + +
+
+ comment: Preliminary version. Comments are welcome +
+
+
+
+
+ + ☆ Context-Aware Composition of Agent Policies by Markov Decision Process + Entity Embeddings and Agent Ensembles + + +
+ Computational agents support humans in many areas of life and are therefore +found in heterogeneous contexts. This means that agents operate in rapidly +changing environments and can be confronted with huge state and action spaces. +In order to perform services and carry out activities in a goal-oriented +manner, agents require prior knowledge and therefore have to develop and pursue +context-dependent policies. The problem is that prescribing policies in advance +is limited and inflexible, especially in dynamically changing environments. +Moreover, the context of an agent determines its choice of actions. Since the +environments in which agents operate can be stochastic and complex in terms of +the number of states and feasible actions, activities are usually modelled in a +simplified way by Markov decision processes so that agents with reinforcement +learning are able to learn policies that help to capture the context and act +accordingly to optimally perform activities. However, training policies for all +possible contexts using reinforcement learning is time-consuming. A requirement +and challenge for agents is to learn strategies quickly and respond immediately +in cross-context environments and applications. In this work, we propose a +novel simulation-based approach that enables a) the representation of +heterogeneous contexts through knowledge graphs and entity embeddings and b) +the context-aware composition of policies on demand by ensembles of agents +running in parallel. The evaluation we performed on the "Virtual Home" dataset +indicates that agents that need to seamlessly switch between different +contexts, can request on-the-fly composed policies that lead to the successful +completion of context-appropriate activities without having to learn these +policies in lengthy training steps and episodes, in contrast to agents that +apply reinforcement learning. + +
+
comment: 29 pages, 11 figures, 9 tables, 3 listings; under revision for + re-submission to the Semantic Web Journal +
+
+
+
+
+ + ☆ Prediction of Tourism Flow with Sparse Geolocation Data SC2023 + + +
+ Modern tourism in the 21st century is facing numerous challenges. Among these, +the rapidly growing number of tourists visiting space-limited regions such as +historical cities, museums, and bottlenecks like bridges is one of the +biggest. In this context, a proper and accurate prediction of tourism volume +and tourism flow within a certain area is important and critical for visitor +management tasks such as sustainable treatment of the environment and +prevention of overcrowding. Static flow control methods, such as conventional +low-level controllers or limiting access to overcrowded venues, have not yet +solved the problem. In this paper, we empirically evaluate the performance of +state-of-the-art deep-learning methods such as RNNs, GNNs, and Transformers as +well as the classic statistical ARIMA method. The granular but limited data +supplied by a tourism region is extended with exogenous data such as +geolocation trajectories of individual tourists, weather, and holidays. In the +field of visitor flow prediction with sparse data, we are thereby able to +increase the accuracy of our predictions by incorporating modern input feature +handling and by mapping geolocation data on top of discrete POI data. + +
+
+ comment: Accepted for publication at the proceedings of the 5th International + Data Science Conference - iDSC2023. arXiv admin note: substantial text + overlap with arXiv:2206.13274 +
+
+
+
+
+ + ☆ Spectral Estimators for Structured Generalized Linear Models via + Approximate Message Passing + + +
+ We consider the problem of parameter estimation from observations given by a +generalized linear model. Spectral methods are a simple yet effective approach +for estimation: they estimate the parameter via the principal eigenvector of a +matrix obtained by suitably preprocessing the observations. Despite their wide +use, a rigorous performance characterization of spectral estimators, as well as +a principled way to preprocess the data, is available only for unstructured +(i.e., i.i.d. Gaussian and Haar) designs. In contrast, real-world design +matrices are highly structured and exhibit non-trivial correlations. To address +this problem, we consider correlated Gaussian designs which capture the +anisotropic nature of the measurements via a feature covariance matrix +$\Sigma$. Our main result is a precise asymptotic characterization of the +performance of spectral estimators in this setting. This then allows us to +identify the optimal preprocessing that minimizes the number of samples needed +to meaningfully estimate the parameter. Remarkably, such an optimal spectral +estimator depends on $\Sigma$ only through its normalized trace, which can be +consistently estimated from the data. Numerical results demonstrate the +advantage of our principled approach over previous heuristic methods. + Existing analyses of spectral estimators crucially rely on the rotational +invariance of the design matrix. This key assumption does not hold for +correlated Gaussian designs. To circumvent this difficulty, we develop a novel +strategy based on designing and analyzing an approximate message passing +algorithm whose fixed point coincides with the desired spectral estimator. Our +methodology is general, and opens the way to the precise characterization of +spiked matrices and of the corresponding spectral methods in a variety of +settings. + +
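In its basic form, a spectral estimator is only a few lines of code: build the preprocessed matrix $D = \frac{1}{n}\sum_i \mathcal{T}(y_i)\, x_i x_i^\top$ and take its principal eigenvector. In the sketch below the preprocessing function `T` is a placeholder (it is precisely the design choice the paper optimizes), so `np.tanh` is illustrative only.

```python
import numpy as np

def spectral_estimate(X: np.ndarray, y: np.ndarray, T=np.tanh) -> np.ndarray:
    """Principal eigenvector of D = (1/n) sum_i T(y_i) x_i x_i^T."""
    n, _ = X.shape
    D = (X * T(y)[:, None]).T @ X / n
    _, eigvecs = np.linalg.eigh(D)     # symmetric eigendecomposition
    return eigvecs[:, -1]              # top eigenvector as the estimate
```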
+
+
+
+
+ + ☆ Rebalancing Social Feed to Minimize Polarization and Disagreement CIKM 2023 + + +
+ Social media have great potential for enabling public discourse on important +societal issues. However, adverse effects, such as polarization and echo +chambers, greatly impact the benefits of social media and call for algorithms +that mitigate these effects. In this paper, we propose a novel problem +formulation aimed at slightly nudging users' social feeds in order to strike a +balance between relevance and diversity, thus mitigating the emergence of +polarization, without lowering the quality of the feed. Our approach is based +on re-weighting the relative importance of the accounts that a user follows, so +as to calibrate the frequency with which the content produced by various +accounts is shown to the user. We analyze the convexity properties of the +problem, demonstrating the non-matrix convexity of the objective function and +the convexity of the feasible set. To efficiently address the problem, we +develop a scalable algorithm based on projected gradient descent. We also prove +that our problem statement is a proper generalization of the undirected-case +problem so that our method can also be adopted for undirected social networks. +As a baseline for comparison in the undirected case, we develop a semidefinite +programming approach, which provides the optimal solution. Through extensive +experiments on synthetic and real-world datasets, we validate the effectiveness +of our approach, which outperforms non-trivial baselines, underscoring its +ability to foster healthier and more cohesive online communities. + +
+
+ comment: Accepted for publication at ACM CIKM 2023 +
+
+
+
+
+ + ☆ Group Regression for Query Based Object Detection and Tracking SC 2023 + + +
+ Group regression is commonly used in 3D object detection to predict box +parameters of similar classes in a joint head, aiming to benefit from +similarities while separating highly dissimilar classes. For query-based +perception methods, this has, so far, not been feasible. We close this gap and +present a method to incorporate multi-class group regression, especially +designed for the 3D domain in the context of autonomous driving, into existing +attention and query-based perception approaches. We enhance a transformer based +joint object detection and tracking model with this approach, and thoroughly +evaluate its behavior and performance. For group regression, the classes of the +nuScenes dataset are divided into six groups of similar shape and prevalence, +each being regressed by a dedicated head. We show that the proposed method is +applicable to many existing transformer based perception approaches and can +bring potential benefits. The behavior of query group regression is thoroughly +analyzed in comparison to a unified regression head, e.g. in terms of +class-switching behavior and distribution of the output parameters. The +proposed method offers many possibilities for further research, such as in the +direction of deep multi-hypotheses tracking. + +
+
+ comment: Accepted for publication at the 2023 26th IEEE International + Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28, + 2023, in Bilbao, Spain +
+
+
+
+
+ + ☆ Some issues in robust clustering + + +
+ Some key issues in robust clustering are discussed with focus on Gaussian +mixture model based clustering, namely the formal definition of outliers, +ambiguity between groups of outliers and clusters, the interaction between +robust clustering and the estimation of the number of clusters, the essential +dependence of (not only) robust clustering on tuning decisions, and +shortcomings of existing measurements of cluster stability when it comes to +outliers. + +
+
+ comment: 11 pages, no figures +
+
+
+
+
+ + ☆ Speech Self-Supervised Representations Benchmarking: a Case for Larger + Probing Heads + + +
+ Self-supervised learning (SSL) leverages large datasets of unlabeled speech +to reach impressive performance with reduced amounts of annotated data. The +high number of proposed approaches fostered the emergence of comprehensive +benchmarks that evaluate their performance on a set of downstream tasks +exploring various aspects of the speech signal. However, while the number of +considered tasks has been growing, most proposals rely upon a single downstream +architecture that maps the frozen SSL representations to the task labels. This +study examines how benchmarking results are affected by changes in the probing +head architecture. Interestingly, we found that altering the downstream +architecture structure leads to significant fluctuations in the performance +ranking of the evaluated models. Against common practices in speech SSL +benchmarking, we evaluate larger-capacity probing heads, showing their impact +on performance, inference costs, generalization and multi-level feature +exploitation. + +
+
+ comment: 11 Pages +
+
+
+
+
+ + ☆ TextrolSpeech: A Text Style Control Speech Corpus With Codec Language + Text-to-Speech Models + + +
+ Recently, there has been a growing interest in the field of controllable +Text-to-Speech (TTS). While previous studies have relied on users providing +specific style factor values based on acoustic knowledge or selecting reference +speeches that meet certain requirements, generating speech solely from natural +text prompts has emerged as a new challenge for researchers. This challenge +arises due to the scarcity of high-quality speech datasets with natural text +style prompts and the absence of advanced text-controllable TTS models. In +light of this, 1) we propose TextrolSpeech, which is the first large-scale +speech emotion dataset annotated with rich text attributes. The dataset +comprises 236,220 pairs of style prompts in natural text descriptions, covering +five style factors, and corresponding speech samples. Through iterative +experimentation, we introduce a multi-stage prompt programming approach that +effectively utilizes the GPT model for generating natural style descriptions in +large volumes. 2) Furthermore, to address the need for generating audio with +greater style diversity, we propose an efficient architecture called Salle. +This architecture treats text-controllable TTS as a language modeling task, +utilizing audio codec codes as an intermediate representation to replace the +conventional mel-spectrogram. Finally, we successfully demonstrate the ability +of the proposed model by showing comparable performance on the controllable TTS +task. Audio samples are available at https://sall-e.github.io/ + +
+
+
+
+
+ + ☆ Shielded Reinforcement Learning for Hybrid Systems + + +
+ Safe and optimal controller synthesis for switched-controlled hybrid systems, which combine differential equations and discrete changes of the system's state, is known to be intricately hard. Reinforcement learning has been leveraged to construct near-optimal controllers, but their behavior is not guaranteed to be safe, even when it is encouraged by reward engineering. One way of imposing safety on a learned controller is to use a shield, which is correct by design. However, obtaining a shield for non-linear and hybrid environments is itself intractable. In this paper, we propose the construction of a shield using the so-called barbaric method, where an approximate finite representation of an underlying partition-based two-player safety game is extracted via systematically picked samples of the true transition function. While hard safety guarantees are out of reach, we experimentally demonstrate strong statistical safety guarantees with a prototype implementation and UPPAAL STRATEGO. Furthermore, we study the impact of the synthesized shield when applied as either a pre-shield (applied before learning a controller) or a post-shield (only applied after learning a controller). We experimentally demonstrate the superiority of the pre-shielding approach. We apply our technique on a range of case studies, including two industrial examples, and further study post-optimization of the post-shielding approach.
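+
+ A minimal sketch of the shielding idea, with a hypothetical shield standing in for the safety-game abstraction synthesized in the paper: the learned policy's action is replaced whenever the shield disallows it.
+
+ def shielded_action(state, policy, shield, fallback):
+     action = policy(state)
+     allowed = shield(state)          # set of actions deemed safe in this state
+     if action in allowed:
+         return action
+     return fallback(state, allowed)  # e.g. pick a safe replacement action
+
+ # toy usage with lambdas standing in for the real components
+ policy   = lambda s: "accelerate"
+ shield   = lambda s: {"brake", "coast"}
+ fallback = lambda s, allowed: sorted(allowed)[0]
+ print(shielded_action("near_obstacle", policy, shield, fallback))  # -> "brake"
+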
+
+
+
+
+ + ☆ Task-Aware Machine Unlearning and Its Application in Load Forecasting + + +
+ Data privacy and security have become a non-negligible factor in load forecasting. Previous research mainly focuses on training-stage enhancement. However, once the model is trained and deployed, it may need to 'forget' (i.e., remove the impact of) part of the training data if the data is found to be malicious or as requested by the data owner. This paper introduces a machine unlearning algorithm specifically designed to remove the influence of part of the original dataset on an already trained forecaster. However, direct unlearning inevitably degrades the model's generalization ability. To balance unlearning completeness against performance degradation, a performance-aware algorithm is proposed that evaluates the sensitivity of local model parameter changes using the influence function and sample re-weighting. Moreover, we observe that the statistical criterion cannot fully reflect the operation cost of downstream tasks. Therefore, a task-aware machine unlearning algorithm is proposed whose objective is a tri-level optimization with dispatch and redispatch problems considered. We theoretically prove the existence of the gradient of such an objective, which is key to re-weighting the remaining samples. We test the unlearning algorithms on linear and neural network load forecasters with a realistic load dataset. The simulation demonstrates the balance between unlearning completeness and operational cost. All codes can be found at https://github.com/xuwkk/task_aware_machine_unlearning.
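+
+ A minimal sketch of the influence-function ingredient on a ridge-regression forecaster: removing one sample's contribution is approximated by a single Newton step. This illustrates only the building block, not the paper's performance-aware or task-aware algorithms.
+
+ import numpy as np
+
+ def fit_ridge(X, y, lam=1e-2):
+     n, d = X.shape
+     H = X.T @ X / n + lam * np.eye(d)     # Hessian of the ridge objective
+     theta = np.linalg.solve(H, X.T @ y / n)
+     return theta, H
+
+ def unlearn_sample(theta, H, x_i, y_i, n):
+     grad_i = x_i * (x_i @ theta - y_i)    # per-sample gradient of squared loss
+     return theta + np.linalg.solve(H, grad_i) / n
+
+ X, y = np.random.randn(200, 5), np.random.randn(200)
+ theta, H = fit_ridge(X, y)
+ theta_minus = unlearn_sample(theta, H, X[0], y[0], len(X))
+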
+
+
+
+
+ + ☆ Steerable Conditional Diffusion for Out-of-Distribution Adaptation in + Imaging Inverse Problems + + +
+ Denoising diffusion models have emerged as the go-to framework for solving +inverse problems in imaging. A critical concern regarding these models is their +performance on out-of-distribution (OOD) tasks, which remains an under-explored +challenge. Realistic reconstructions inconsistent with the measured data can be +generated, hallucinating image features that are uniquely present in the +training dataset. To simultaneously enforce data-consistency and leverage +data-driven priors, we introduce a novel sampling framework called Steerable +Conditional Diffusion. This framework adapts the denoising network specifically +to the available measured data. Utilising our proposed method, we achieve +substantial enhancements in OOD performance across diverse imaging modalities, +advancing the robust deployment of denoising diffusion models in real-world +applications. + +
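+
+ A minimal sketch of steering a denoising iterate toward data consistency: after each update, the iterate takes a gradient step on the measurement residual. Here denoise_step, A, and y are hypothetical stand-ins, and the paper's actual adaptation scheme may differ.
+
+ import torch
+
+ def guided_step(x_t, t, denoise_step, A, y, step_size=0.1):
+     x = denoise_step(x_t, t)               # unconditional denoising update
+     x = x.detach().requires_grad_(True)
+     residual = ((A(x) - y) ** 2).sum()     # data-consistency term ||A x - y||^2
+     grad, = torch.autograd.grad(residual, x)
+     return (x - step_size * grad).detach()
+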
+
+
+
+
+ + ☆ Identifying topology of leaky photonic lattices with machine learning + + +
+ We show how machine learning techniques can be applied for the classification +of topological phases in leaky photonic lattices using limited measurement +data. We propose an approach based solely on bulk intensity measurements, thus +exempt from the need for complicated phase retrieval procedures. In particular, +we design a fully connected neural network that accurately determines +topological properties from the output intensity distribution in dimerized +waveguide arrays with leaky channels, after propagation of a spatially +localized initial excitation at a finite distance, in a setting that closely +emulates realistic experimental conditions. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer + and NearFarMix Augmentation WACV 2024 + + +
+ In computer vision, depth estimation is crucial for domains like robotics, autonomous vehicles, augmented reality, and virtual reality. Integrating semantics with depth enhances scene understanding through reciprocal information sharing. However, the scarcity of semantic information in datasets poses challenges. Existing convolutional approaches with limited local receptive fields hinder the full utilization of the symbiotic potential between depth and semantics. This paper introduces a dataset-invariant semi-supervised strategy to address the scarcity of semantic information. It proposes the Depth Semantics Symbiosis module, leveraging the Symbiotic Transformer for achieving comprehensive mutual awareness through information exchange within both local and global contexts. Additionally, a novel augmentation, NearFarMix, is introduced to combat overfitting and to complement both the depth and semantics tasks by strategically merging regions from two images, generating diverse and structurally consistent samples with enhanced control. Extensive experiments on the NYU-Depth-V2 and KITTI datasets demonstrate the superiority of our proposed techniques in indoor and outdoor environments.
+
+ comment: Accepted at WACV 2024 +
+
+
+
+
+ + ☆ Biclustering Methods via Sparse Penalty + + +
+ In this paper, we first reviewed several biclustering methods that are used to identify the most significant clusters in gene expression data. Here we mainly focused on the SSVD (sparse SVD) method and tried a new sparse penalty named the "Prenet penalty", which has previously been used only in factor analysis to gain sparsity. Then, in the simulation study, we tried different types of generated datasets (with different sparsity and dimension), first with a 1-layer approximation and then with k layers, which shows that the mixed Prenet penalty is very effective for non-overlapping data. Finally, we used some real gene expression data to show the behavior of our methods.
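+
+ A minimal sketch of sparse rank-1 SVD for biclustering via alternating soft-thresholded power iterations; the L1 soft-threshold is a common stand-in here, and the Prenet penalty tried in the paper would replace the soft function.
+
+ import numpy as np
+
+ def soft(v, lam):
+     # L1 soft-thresholding operator
+     return np.sign(v) * np.maximum(np.abs(v) - lam, 0.0)
+
+ def sparse_svd_rank1(X, lam_u=0.1, lam_v=0.1, n_iter=100):
+     u, s, vt = np.linalg.svd(X, full_matrices=False)
+     u, v = u[:, 0], vt[0]                       # warm start from plain SVD
+     for _ in range(n_iter):
+         u = soft(X @ v, lam_u);  u /= np.linalg.norm(u) + 1e-12
+         v = soft(X.T @ u, lam_v); v /= np.linalg.norm(v) + 1e-12
+     return u, v, u @ X @ v      # sparse left/right vectors define the bicluster
+
+ X = np.random.randn(50, 40)
+ u, v, d = sparse_svd_rank1(X)
+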
+
+
+
+
+ + ☆ Self-Supervision for Tackling Unsupervised Anomaly Detection: Pitfalls + and Opportunities + + +
+ Self-supervised learning (SSL) is a growing torrent that has recently transformed machine learning and its many real world applications, by learning on massive amounts of unlabeled data via self-generated supervisory signals. Unsupervised anomaly detection (AD) has also capitalized on SSL, by self-generating pseudo-anomalies through various data augmentation functions or external data exposure. In this vision paper, we first underline the importance of the choice of SSL strategies for AD performance, by presenting evidence and studies from the AD literature. Equipped with the understanding that SSL incurs various hyperparameters (HPs) that must be carefully tuned, we present recent developments on unsupervised model selection and augmentation tuning for SSL-based AD. We then highlight emerging challenges and future opportunities: designing new pretext tasks and augmentation functions for different data modalities, creating novel model selection solutions for systematically tuning the SSL HPs, and capitalizing on the potential of pretrained foundation models for AD through effective density estimation.
+
+
+
+
+ + ☆ Meta Attentive Graph Convolutional Recurrent Network for Traffic + Forecasting + + +
+ Traffic forecasting is a fundamental problem in intelligent transportation systems. Existing traffic predictors are limited by their expressive power to model the complex spatial-temporal dependencies in traffic data, mainly due to the following limitations. Firstly, most approaches are primarily designed to model local shared patterns, which makes them insufficient to capture the specific patterns associated with each node globally. Hence, they fail to learn each node's unique properties and diversified patterns. Secondly, most existing approaches struggle to accurately model both short- and long-term dependencies simultaneously. In this paper, we propose a novel traffic predictor, named Meta Attentive Graph Convolutional Recurrent Network (MAGCRN). MAGCRN utilizes a Graph Convolutional Recurrent Network (GCRN) as a core module to model local dependencies and improves its operation with two novel modules: 1) a Node-Specific Meta Pattern Learning (NMPL) module to capture node-specific patterns globally and 2) a Node Attention Weight Generation (NAWG) module to capture short- and long-term dependencies by connecting the node-specific features with the ones learned initially at each time step during GCRN operation. Experiments on six real-world traffic datasets demonstrate that NMPL and NAWG together enable MAGCRN to outperform state-of-the-art baselines on both short- and long-term predictions.
+
+
+
+
+ + ☆ Are Existing Out-Of-Distribution Techniques Suitable for Network + Intrusion Detection? + + +
+ Machine learning (ML) has become increasingly popular in network intrusion detection. However, ML-based solutions always respond regardless of whether the input data reflects known patterns, a common issue across safety-critical applications. While several proposals exist for detecting Out-Of-Distribution (OOD) data in other fields, it remains unclear whether these approaches can effectively identify new forms of intrusions for network security. New attacks, which do not necessarily affect overall distributions, are not guaranteed to be clearly OOD in the way that images depicting new classes are in computer vision. In this work, we investigate whether existing OOD detectors from other fields allow the identification of unknown malicious traffic. We also explore whether more discriminative and semantically richer embedding spaces within models, such as those created with contrastive learning and multi-class tasks, benefit detection. Our investigation covers a set of six OOD techniques that employ different detection strategies. These techniques are applied to models trained in various ways and subsequently exposed to unknown malicious traffic from the same and different datasets (network environments). Our findings suggest that existing detectors can identify a consistent portion of new malicious traffic, and that improved embedding spaces enhance detection. We also demonstrate that simple combinations of certain detectors can identify almost 100% of malicious traffic in our tested scenarios.
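+
+ Two of the simplest post-hoc OOD scores of the kind compared above, sketched on classifier logits: maximum softmax probability (MSP) and the energy score; both return higher values for more in-distribution inputs.
+
+ import torch
+ import torch.nn.functional as F
+
+ def msp_score(logits):
+     # confidence of the predicted class
+     return F.softmax(logits, dim=-1).max(dim=-1).values
+
+ def energy_score(logits, T=1.0):
+     # negative free energy; higher means more in-distribution
+     return T * torch.logsumexp(logits / T, dim=-1)
+
+ logits = torch.randn(8, 5)      # stand-in for network-traffic classifier logits
+ print(msp_score(logits), energy_score(logits))
+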
+
+
+
+
+ + ☆ Online Continual Learning on Hierarchical Label Expansion ICCV 2023 + + +
+ Continual learning (CL) enables models to adapt to new tasks and environments +without forgetting previously learned knowledge. While current CL setups have +ignored the relationship between labels in the past task and the new task with +or without small task overlaps, real-world scenarios often involve hierarchical +relationships between old and new tasks, posing another challenge for +traditional CL approaches. To address this challenge, we propose a novel +multi-level hierarchical class incremental task configuration with an online +learning constraint, called hierarchical label expansion (HLE). Our +configuration allows a network to first learn coarse-grained classes, with data +labels continually expanding to more fine-grained classes in various hierarchy +depths. To tackle this new setup, we propose a rehearsal-based method that +utilizes hierarchy-aware pseudo-labeling to incorporate hierarchical class +information. Additionally, we propose a simple yet effective memory management +and sampling strategy that selectively adopts samples of newly encountered +classes. Our experiments demonstrate that our proposed method can effectively +use hierarchy on our HLE setup to improve classification accuracy across all +levels of hierarchies, regardless of depth and class imbalance ratio, +outperforming prior state-of-the-art works by significant margins while also +outperforming them on the conventional disjoint, blurry and i-Blurry CL setups. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Target-independent XLA optimization using Reinforcement Learning NeurIPS 2022 + + +
+ An important challenge in Machine Learning compilers like XLA is multi-pass optimization and analysis. Recent interest has chiefly focused on XLA target-dependent optimization at the graph-level, subgraph-level, and kernel-level phases. We specifically focus on target-independent optimization of XLA HLO pass ordering: our approach aims at finding the optimal sequence of compiler optimization passes, decoupled from target-dependent optimization. However, there has been little domain-specific study of pass ordering for XLA HLO. To this end, we propose introducing deep Reinforcement Learning (RL) based search for optimal XLA HLO pass ordering. We also propose enhancements to the deep RL algorithms to further improve optimal search performance and open the research direction of domain-specific guidance for RL. We create an XLA Gym experimentation framework as a tool to enable RL algorithms to interact with the compiler for pass optimization and thereby train agents. Overall, in our experimentation we observe an average of $13.3\%$ improvement in operation count reduction on a benchmark of GPT-2 training graphs and $10.4\%$ improvement on a diverse benchmark including GPT-2, BERT, and ResNet graphs using the proposed approach over the compiler's default phase ordering.
+
+ comment: Workshop on ML for Systems @ NeurIPS 2022 +
+
+
+
+
+ + ☆ Can Transformer and GNN Help Each Other? + + +
+ Although Transformers have achieved great success in natural language processing and computer vision, they have difficulty generalizing to medium- and large-scale graph data, for two important reasons: (i) high complexity; (ii) failure to capture the complex and entangled structure information. In graph representation learning, Graph Neural Networks (GNNs) can fuse the graph structure and node attributes but have limited receptive fields. Therefore, we ask whether we can combine Transformers and GNNs to help each other. In this paper, we propose a new model named TransGNN, in which the Transformer layer and GNN layer are used alternately to improve each other. Specifically, to expand the receptive field and disentangle the information aggregation from edges, we propose using the Transformer to aggregate more relevant nodes' information to improve the message passing of GNNs. Besides, to capture the graph structure information, we utilize positional encoding and make use of the GNN layer to fuse the structure into node attributes, which improves the Transformer on graph data. We also propose to sample the most relevant nodes for the Transformer, along with two efficient sample-update strategies to lower the complexity. Finally, we theoretically prove that TransGNN is more expressive than GNNs with only extra linear complexity. Experiments on eight datasets corroborate the effectiveness of TransGNN on node and graph classification tasks.
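+
+ A minimal sketch of alternating attention and message-passing blocks in the spirit described above; the dense mean-aggregation GNN step is an illustrative stand-in, not the paper's architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class AlternatingBlock(nn.Module):
+     def __init__(self, dim, heads=4):
+         super().__init__()
+         self.attn = nn.TransformerEncoderLayer(dim, heads, batch_first=True)
+         self.gnn = nn.Linear(dim, dim)
+
+     def forward(self, x, adj):            # x: (nodes, dim), adj: (nodes, nodes)
+         x = self.attn(x.unsqueeze(0)).squeeze(0)   # global attention step
+         deg = adj.sum(-1, keepdim=True).clamp(min=1)
+         x = torch.relu(self.gnn(adj @ x / deg))    # local neighborhood step
+         return x
+
+ x, adj = torch.randn(10, 64), (torch.rand(10, 10) > 0.5).float()
+ out = AlternatingBlock(64)(x, adj)
+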
+
+
+
+
+ + ☆ EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models + + +
+ Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a +revolution in machine intelligence, owing to their exceptional capabilities in +a wide range of machine learning tasks. However, the transition of LLMs from +data centers to edge devices presents a set of challenges and opportunities. +While this shift can enhance privacy and availability, it is hampered by the +enormous parameter sizes of these models, leading to impractical runtime costs. +In light of these considerations, we introduce EdgeMoE, the first on-device +inference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant +of sparse LLMs that exhibit nearly constant computational complexity as their +parameter size scales. EdgeMoE achieves both memory and computational +efficiency by strategically partitioning the model across the storage +hierarchy. Specifically, non-expert weights are stored in the device's memory, +while expert weights are kept in external storage and are fetched into memory +only when they are activated. This design is underpinned by a crucial insight +that expert weights, though voluminous, are infrequently accessed due to sparse +activation patterns. To further mitigate the overhead associated with expert +I/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise +bitwidth adaptation: This method reduces the size of expert weights with an +acceptable level of accuracy loss. (2) Expert management: It predicts the +experts that will be activated in advance and preloads them into the +compute-I/O pipeline, thus further optimizing the process. In empirical +evaluations conducted on well-established MoE LLMs and various edge devices, +EdgeMoE demonstrates substantial memory savings and performance improvements +when compared to competitive baseline solutions. + +
+
+
+
+
+ + ☆ Simple Modification of the Upper Confidence Bound Algorithm by + Generalized Weighted Averages + + +
+ The multi-armed bandit (MAB) problem is a classical problem that models sequential decision-making under uncertainty in reinforcement learning. In this study, we propose a new generalized upper confidence bound (UCB) algorithm (GWA-UCB1) by extending UCB1, which is a representative algorithm for MAB problems, using generalized weighted averages, and present an effective algorithm for various problem settings. GWA-UCB1 is a two-parameter generalization of the balance between exploration and exploitation in UCB1 and can be implemented with a simple modification of the UCB1 formula. Therefore, this algorithm can be easily applied to UCB-based reinforcement learning models. In preliminary experiments, we investigated the optimal parameters of a simple generalized UCB1 (G-UCB1), prepared for comparison, and of GWA-UCB1 in a stochastic MAB problem with two arms. Subsequently, we confirmed the performance of the algorithms with the investigated parameters on stochastic MAB problems whose arm reward probabilities were sampled from uniform or normal distributions, and on survival MAB problems assuming more realistic situations. GWA-UCB1 outperformed G-UCB1, UCB1-Tuned, and Thompson sampling in most problem settings and can be useful in many situations. The code is available at https://github.com/manome/python-mab.
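+
+ A minimal sketch of a two-parameter generalized weighted average of the UCB1 exploitation and exploration terms; the exact GWA-UCB1 formula is given in the paper and the linked code, so the power-mean combination and the parameters alpha and p below are assumptions, not the authors' definition.
+
+ import numpy as np
+
+ def gwa_ucb_index(mean, count, t, alpha=0.5, p=1.0):
+     bonus = np.sqrt(2 * np.log(t) / count)   # classic UCB1 exploration bonus
+     # generalized weighted (power) mean of exploitation and exploration
+     return ((1 - alpha) * mean**p + alpha * bonus**p) ** (1 / p)
+
+ means, counts, t = np.array([0.4, 0.6]), np.array([10, 5]), 15
+ arm = int(np.argmax(gwa_ucb_index(means, counts, t)))
+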
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Label-free Deep Learning Driven Secure Access Selection in + Space-Air-Ground Integrated Networks + + +
+ In space-air-ground integrated networks (SAGIN), the inherent openness and extensive broadcast coverage expose these networks to significant eavesdropping threats. The inherent co-channel interference due to spectrum sharing among multi-tier access networks in SAGIN can be leveraged to assist physical layer security among heterogeneous transmissions. However, it is challenging to conduct a secrecy-oriented access strategy due to both heterogeneous resources and different eavesdropping models. In this paper, we explore secure access selection for a scenario involving multi-mode users capable of accessing satellites, unmanned aerial vehicles, or base stations in the presence of eavesdroppers. Particularly, we propose a Q-network approximation based deep learning approach for selecting the optimal access strategy for maximizing the sum secrecy rate. Meanwhile, power optimization is also carried out by an unsupervised learning approach to improve the secrecy performance. Remarkably, two neural networks are trained by unsupervised learning and Q-network approximation, both of which are label-free methods that do not require the optimal solution as labels. Numerical results verify the efficiency of our proposed power optimization approach and access strategy, leading to enhanced secure transmission performance.
+
+
+
+
+ + ☆ Buy when? Survival machine learning model comparison for purchase timing + + +
+ The value of raw data is unlocked by converting it into information and knowledge that drives decision-making. Machine Learning (ML) algorithms are capable of analysing large datasets and making accurate predictions. Market segmentation, client lifetime value, and marketing techniques have all made use of machine learning. This article examines marketing machine learning techniques such as Support Vector Machines, Genetic Algorithms, Deep Learning, and K-Means. ML is used to analyse consumer behaviour, recommend items, and inform other customer choices about whether or not to purchase a product or service, but it is seldom used to predict when a person will buy a product or a basket of products. In this paper, the survival models Kernel SVM, DeepSurv, Survival Random Forest, and MTLR are examined to predict individuals' time-to-purchase decisions. Gender, Income, Location, PurchaseHistory, OnlineBehavior, Interests, PromotionsDiscounts and CustomerExperience all have an influence on purchasing time, according to the analysis. The study shows that the DeepSurv model predicted purchase completion the best. These insights assist marketers in increasing conversion rates.
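+
+ A minimal sketch of fitting one survival model for time-to-purchase: a Cox proportional-hazards model from lifelines stands in for the Kernel SVM, DeepSurv, Survival Random Forest, and MTLR models compared in the paper, and the toy columns are hypothetical.
+
+ import pandas as pd
+ from lifelines import CoxPHFitter
+
+ df = pd.DataFrame({
+     "income":        [55, 72, 38, 90, 61, 47, 83, 29],
+     "online_visits": [12, 3, 25, 7, 14, 9, 4, 18],
+     "days_to_buy":   [10, 40, 5, 60, 22, 35, 50, 8],  # observed time
+     "purchased":     [1, 0, 1, 0, 1, 1, 0, 1],         # 0 = censored (no buy yet)
+ })
+
+ cph = CoxPHFitter()
+ cph.fit(df, duration_col="days_to_buy", event_col="purchased")
+ print(cph.summary[["coef", "p"]])   # covariate effects on purchase hazard
+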
+
+
+
+
+ + ☆ HRGCN: Heterogeneous Graph-level Anomaly Detection with Hierarchical + Relation-augmented Graph Neural Networks + + +
+ This work considers the problem of heterogeneous graph-level anomaly +detection. Heterogeneous graphs are commonly used to represent behaviours +between different types of entities in complex industrial systems for capturing +as much information about the system operations as possible. Detecting +anomalous heterogeneous graphs from a large set of system behaviour graphs is +crucial for many real-world applications like online web/mobile service and +cloud access control. To address the problem, we propose HRGCN, an unsupervised +deep heterogeneous graph neural network, to model complex heterogeneous +relations between different entities in the system for effectively identifying +these anomalous behaviour graphs. HRGCN trains a hierarchical +relation-augmented Heterogeneous Graph Neural Network (HetGNN), which learns +better graph representations by modelling the interactions among all the system +entities and considering both source-to-destination entity (node) types and +their relation (edge) types. Extensive evaluation on two real-world application +datasets shows that HRGCN outperforms state-of-the-art competing anomaly +detection approaches. We further present a real-world industrial case study to +justify the effectiveness of HRGCN in detecting anomalous (e.g., congested) +network devices in a mobile communication service. HRGCN is available at +https://github.com/jiaxililearn/HRGCN. + +
+
+ comment: 12 pages, 10 figures, 6 tables. Accepted +
+
+
+
+
+ + ☆ Fair Few-shot Learning with Auxiliary Sets ECAI 2023 + + +
+ Recently, there has been a growing interest in developing machine learning +(ML) models that can promote fairness, i.e., eliminating biased predictions +towards certain populations (e.g., individuals from a specific demographic +group). Most existing works learn such models based on well-designed fairness +constraints in optimization. Nevertheless, in many practical ML tasks, only +very few labeled data samples can be collected, which can lead to inferior +fairness performance. This is because existing fairness constraints are +designed to restrict the prediction disparity among different sensitive groups, +but with few samples, it becomes difficult to accurately measure the disparity, +thus rendering ineffective fairness optimization. In this paper, we define the +fairness-aware learning task with limited training samples as the \emph{fair +few-shot learning} problem. To deal with this problem, we devise a novel +framework that accumulates fairness-aware knowledge across different +meta-training tasks and then generalizes the learned knowledge to meta-test +tasks. To compensate for insufficient training samples, we propose an essential +strategy to select and leverage an auxiliary set for each meta-test task. These +auxiliary sets contain several labeled training samples that can enhance the +model performance regarding fairness in meta-test tasks, thereby allowing for +the transfer of learned useful fairness-oriented knowledge to meta-test tasks. +Furthermore, we conduct extensive experiments on three real-world datasets to +validate the superiority of our framework against the state-of-the-art +baselines. + +
+
+ comment: ECAI 2023 +
+
+
+
+
+ + ☆ DiffSmooth: Certifiably Robust Learning via Diffusion Models and Local + Smoothing USENIX Security + + +
+ Diffusion models have been leveraged to perform adversarial purification and thus provide both empirical and certified robustness for a standard model. On the other hand, different robustly trained smoothed models have been studied to improve the certified robustness. This raises a natural question: can diffusion models be used to achieve improved certified robustness on those robustly trained smoothed models? In this work, we first theoretically show that instances recovered by diffusion models lie in a bounded neighborhood of the original instance with high probability, and that "one-shot" denoising diffusion probabilistic models (DDPM) can approximate the mean of the generated distribution of a continuous-time diffusion model, which approximates the original instance under mild conditions. Inspired by our analysis, we propose a certifiably robust pipeline, DiffSmooth, which first performs adversarial purification via diffusion models and then maps the purified instances to a common region via a simple yet effective local smoothing strategy. We conduct extensive experiments on different datasets and show that DiffSmooth achieves SOTA certified robustness compared with eight baselines. For instance, DiffSmooth improves the SOTA certified accuracy from $36.0\%$ to $53.0\%$ under $\ell_2$ radius $1.5$ on ImageNet. The code is available at [https://github.com/javyduck/DiffSmooth].
+
+ comment: Accepted in 32nd USENIX Security, 2023 +
+
+
+
+
+ + ☆ Reinforcement Learning for Generative AI: A Survey + + +
+ Deep Generative AI has been a long-standing essential topic in the machine learning community, which can impact a number of application areas like text generation and computer vision. The major paradigm for training a generative model is maximum likelihood estimation, which pushes the learner to capture and approximate the target data distribution by decreasing the divergence between the model distribution and the target distribution. This formulation successfully establishes the objective of generative tasks, but it is incapable of satisfying all the requirements that a user might expect from a generative model. Reinforcement learning, serving as a competitive option to inject new training signals by creating new objectives that exploit novel signals, has demonstrated its power and flexibility to incorporate human inductive bias from multiple angles, such as adversarial learning, hand-designed rules, and learned reward models, to build performant models. Thereby, reinforcement learning has become a trending research field and has stretched the limits of generative AI in both model design and application. It is therefore timely to summarize recent advances in a comprehensive review. Although there have been surveys of individual application areas recently, this survey aims to provide a high-level review that spans a range of application areas. We provide a rigorous taxonomy of this area and broad coverage of various models and applications. Notably, we also survey the fast-developing large language model area. We conclude this survey by showing the potential directions that might tackle the limits of current models and expand the frontiers of generative AI.
+
+
+
+
+ + ☆ Machine Unlearning Methodology base on Stochastic Teacher Network + + +
+ The rise of the phenomenon of the "right to be forgotten" has prompted +research on machine unlearning, which grants data owners the right to actively +withdraw data that has been used for model training, and requires the +elimination of the contribution of that data to the model. A simple method to +achieve this is to use the remaining data to retrain the model, but this is not +acceptable for other data owners who continue to participate in training. +Existing machine unlearning methods have been found to be ineffective in +quickly removing knowledge from deep learning models. This paper proposes using +a stochastic network as a teacher to expedite the mitigation of the influence +caused by forgotten data on the model. We performed experiments on three +datasets, and the findings demonstrate that our approach can efficiently +mitigate the influence of target data on the model within a single epoch. This +allows for one-time erasure and reconstruction of the model, and the +reconstruction model achieves the same performance as the retrained model. + +
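+
+ A minimal sketch of the stochastic-teacher idea: on the forget set, the trained model is pushed toward the outputs of a freshly initialized (never trained) network, erasing learned structure; the model sizes and training loop details are illustrative assumptions.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ student = nn.Linear(16, 4)          # stand-in for the trained model
+ teacher = nn.Linear(16, 4)          # random weights acting as stochastic teacher
+ for p in teacher.parameters():
+     p.requires_grad_(False)
+
+ opt = torch.optim.SGD(student.parameters(), lr=0.1)
+ forget_x = torch.randn(32, 16)      # data whose influence should be removed
+
+ for _ in range(10):
+     # pull student predictions on the forget set toward the random teacher
+     loss = F.kl_div(F.log_softmax(student(forget_x), -1),
+                     F.softmax(teacher(forget_x), -1),
+                     reduction="batchmean")
+     opt.zero_grad(); loss.backward(); opt.step()
+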
+
+ comment: Accepted by 19th International Conference on Advanced Data Mining and + Applications. (ADMA 2023) +
+
+
+
+
+ + ☆ Policy Diversity for Cooperative Agents + + +
+ Standard cooperative multi-agent reinforcement learning (MARL) methods aim to find the optimal team cooperative policy to complete a task. However, there may exist multiple different ways of cooperating, which domain experts often need to see. Therefore, identifying a set of significantly different policies can alleviate the task complexity for them. Unfortunately, there is a general lack of effective policy diversity approaches specifically designed for the multi-agent domain. In this work, we propose a method called Moment-Matching Policy Diversity to alleviate this problem. This method can generate different team policies to varying degrees by formalizing the difference between team policies as the difference in the actions of selected agents under different policies. Theoretically, we show that our method is a simple way to implement a constrained optimization problem that regularizes the difference between two trajectory distributions using the maximum mean discrepancy. The effectiveness of our approach is demonstrated on a challenging team-based shooter.
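+
+ A minimal sketch of the maximum mean discrepancy between the actions of selected agents under two team policies, the quantity used above to regularize trajectory-distribution differences; the Gaussian kernel, its bandwidth, and the biased estimator are illustrative choices.
+
+ import torch
+
+ def gaussian_kernel(a, b, sigma=1.0):
+     d2 = torch.cdist(a, b) ** 2
+     return torch.exp(-d2 / (2 * sigma**2))
+
+ def mmd2(x, y, sigma=1.0):
+     # biased estimate of squared MMD between samples x and y
+     kxx = gaussian_kernel(x, x, sigma).mean()
+     kyy = gaussian_kernel(y, y, sigma).mean()
+     kxy = gaussian_kernel(x, y, sigma).mean()
+     return kxx + kyy - 2 * kxy
+
+ actions_a = torch.randn(64, 8)   # actions of selected agents under policy A
+ actions_b = torch.randn(64, 8)   # same agents under policy B
+ print(mmd2(actions_a, actions_b))
+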
+
+
+
+
+ + ☆ Solving Attention Kernel Regression Problem via Pre-conditioner + + +
+ Large language models have shown impressive performance in many tasks. One of the major features, from the computation perspective, is computing the attention matrix. Previous works [Zandieh, Han, Daliri, and Karbasi 2023; Alman and Song 2023] have formally studied the possibility and impossibility of approximating the attention matrix. In this work, we define and study a new problem which we call the attention kernel regression problem. We show how to solve the attention kernel regression in the input sparsity time of the data matrix.
+
+
+
+
+ + ☆ Traffic Light Control with Reinforcement Learning + + +
+ Traffic light control is important for reducing congestion in urban mobility systems. This paper proposes a real-time traffic light control method using deep Q learning. Our approach incorporates a reward function considering queue lengths, delays, travel time, and throughput. The model dynamically decides phase changes based on current traffic conditions. The training of the deep Q network involves an offline stage using pre-generated data with fixed schedules and an online stage using real-time traffic data. A deep Q network structure with a "phase gate" component is used to simplify the model's learning task under different phases. A "memory palace" mechanism is used to address sample imbalance during the training process. We validate our approach using both synthetic and real-world traffic flow data on a road intersection in Hangzhou, China. Results demonstrate significant performance improvements of the proposed method in reducing vehicle waiting time (57.1% to 100%), queue lengths (40.9% to 100%), and total travel time (16.8% to 68.0%) compared to traditional fixed signal plans.
+
+
+
+
+ + ☆ Goodhart's Law Applies to NLP's Explanation Benchmarks + + +
+ Despite the rising popularity of saliency-based explanations, the research +community remains at an impasse, facing doubts concerning their purpose, +efficacy, and tendency to contradict each other. Seeking to unite the +community's efforts around common goals, several recent works have proposed +evaluation metrics. In this paper, we critically examine two sets of metrics: +the ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics, +focusing our inquiry on natural language processing. First, we show that we can +inflate a model's comprehensiveness and sufficiency scores dramatically without +altering its predictions or explanations on in-distribution test inputs. Our +strategy exploits the tendency for extracted explanations and their complements +to be "out-of-support" relative to each other and in-distribution inputs. Next, +we demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple +method that encodes the label, even though EVAL-X is precisely motivated to +address such exploits. Our results raise doubts about the ability of current +metrics to guide explainability research, underscoring the need for a broader +reassessment of what precisely these metrics are intended to capture. + +
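+
+ For reference, a minimal sketch of the two ERASER metrics examined above; predict_proba is a hypothetical callable returning the model's probability for the predicted class given a token sequence.
+
+ def comprehensiveness(predict_proba, tokens, rationale):
+     # probability drop when the rationale tokens are removed
+     without = [t for i, t in enumerate(tokens) if i not in rationale]
+     return predict_proba(tokens) - predict_proba(without)
+
+ def sufficiency(predict_proba, tokens, rationale):
+     # probability drop when only the rationale tokens are kept
+     only = [t for i, t in enumerate(tokens) if i in rationale]
+     return predict_proba(tokens) - predict_proba(only)
+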
+
+
+
+
+ + ☆ Unleash Model Potential: Bootstrapped Meta Self-supervised Learning NIPS + + +
+ The long-term goal of machine learning is to learn general visual representations from a small amount of data without supervision, mimicking three advantages of human cognition: i) no need for labels, ii) robustness to data scarcity, and iii) learning from experience. Self-supervised learning and meta-learning are two promising techniques to achieve this goal, but they both only partially capture these advantages and fail to address all the problems. Self-supervised learning struggles to overcome the drawbacks of data scarcity while ignoring prior knowledge that can facilitate learning and generalization. Meta-learning relies on supervised information and suffers from a bottleneck of insufficient learning. To address these issues, we propose a novel Bootstrapped Meta Self-Supervised Learning (BMSSL) framework that aims to simulate the human learning process. We first analyze the close relationship between meta-learning and self-supervised learning. Based on this insight, we reconstruct tasks to leverage the strengths of both paradigms, achieving advantages i and ii. Moreover, we employ a bi-level optimization framework that alternates between solving specific tasks with a learned ability (first level) and improving this ability (second level), attaining advantage iii. To fully harness its power, we introduce a bootstrapped target based on meta-gradients to make the model its own teacher. We validate the effectiveness of our approach with a comprehensive theoretical and empirical study.
+
+ comment: submitted to NIPS +
+
+
+
+
+ + ☆ Breaking Boundaries: Distributed Domain Decomposition with Scalable + Physics-Informed Neural PDE Solvers + + +
+ Mosaic Flow is a novel domain decomposition method designed to scale +physics-informed neural PDE solvers to large domains. Its unique approach +leverages pre-trained networks on small domains to solve partial differential +equations on large domains purely through inference, resulting in high +reusability. This paper presents an end-to-end parallelization of Mosaic Flow, +combining data parallel training and domain parallelism for inference on +large-scale problems. By optimizing the network architecture and data parallel +training, we significantly reduce the training time for learning the Laplacian +operator to minutes on 32 GPUs. Moreover, our distributed domain decomposition +algorithm enables scalable inferences for solving the Laplace equation on +domains 4096 times larger than the training domain, demonstrating strong +scaling while maintaining accuracy on 32 GPUs. The reusability of Mosaic Flow, +combined with the improved performance achieved through the distributed-memory +algorithms, makes it a promising tool for modeling complex physical phenomena +and accelerating scientific discovery. + +
+
+
+
+
+ + ☆ The Promise and Peril of Artificial Intelligence -- Violet Teaming + Offers a Balanced Path Forward + + +
+ Artificial intelligence (AI) promises immense benefits across sectors, yet +also poses risks from dual-use potentials, biases, and unintended behaviors. +This paper reviews emerging issues with opaque and uncontrollable AI systems +and proposes an integrative framework called violet teaming to develop reliable +and responsible AI. Violet teaming combines adversarial vulnerability probing +(red teaming) with solutions for safety and security (blue teaming) while +prioritizing ethics and social benefit. It emerged from AI safety research to +manage risks proactively by design. The paper traces the evolution of red, +blue, and purple teaming toward violet teaming, and then discusses applying +violet techniques to address biosecurity risks of AI in biotechnology. +Additional sections review key perspectives across law, ethics, cybersecurity, +macrostrategy, and industry best practices essential for operationalizing +responsible AI through holistic technical and social considerations. Violet +teaming provides both philosophy and method for steering AI trajectories toward +societal good. With conscience and wisdom, the extraordinary capabilities of AI +can enrich humanity. But without adequate precaution, the risks could prove +catastrophic. Violet teaming aims to empower moral technology for the common +welfare. + +
+
+ comment: 14 pages, 1 figure +
+
+
+
+
+ + ☆ Rule-Based Error Detection and Correction to Operationalize Movement + Trajectory Classification + + +
+ Classification of movement trajectories has many applications in transportation. Supervised neural models represent the current state-of-the-art. Recent security applications require this task to be rapidly employed in environments that may differ from the data used to train such models, for which there is little training data. We provide a neuro-symbolic rule-based framework to conduct error correction and detection of these models to support eventual deployment in security applications. We provide a suite of experiments on several recent and state-of-the-art models and show an accuracy improvement of 1.7% over the SOTA model in the case where all classes are present in training; when 40% of classes are omitted from training, we obtain a 5.2% (zero-shot) and a 23.9% (few-shot) improvement over the SOTA model without resorting to retraining of the base model.
+
+
+
+
+ + ☆ A Comparison of Personalized and Generalized Approaches to Emotion + Recognition Using Consumer Wearable Devices: Machine Learning Study + + +
+ Background: Studies have shown the potential adverse health effects, ranging +from headaches to cardiovascular disease, associated with long-term negative +emotions and chronic stress. Since many indicators of stress are imperceptible +to observers, the early detection and intervention of stress remains a pressing +medical need. Physiological signals offer a non-invasive method of monitoring +emotions and are easily collected by smartwatches. Existing research primarily +focuses on developing generalized machine learning-based models for emotion +classification. Objective: We aim to study the differences between personalized +and generalized machine learning models for three-class emotion classification +(neutral, stress, and amusement) using wearable biosignal data. Methods: We +developed a convolutional encoder for the three-class emotion classification +problem using data from WESAD, a multimodal dataset with physiological signals +for 15 subjects. We compared the results between a subject-exclusive +generalized, subject-inclusive generalized, and personalized model. Results: +For the three-class classification problem, our personalized model achieved an +average accuracy of 95.06% and F1-score of 91.71, our subject-inclusive +generalized model achieved an average accuracy of 66.95% and F1-score of 42.50, +and our subject-exclusive generalized model achieved an average accuracy of +67.65% and F1-score of 43.05. Conclusions: Our results emphasize the need for +increased research in personalized emotion recognition models given that they +outperform generalized models in certain contexts. We also demonstrate that +personalized machine learning models for emotion classification are viable and +can achieve high performance. + +
+
+
+
+
+ + ☆ Quantum Next Generation Reservoir Computing: An Efficient Quantum + Algorithm for Forecasting Quantum Dynamics + + +
+ Next Generation Reservoir Computing (NG-RC) is a modern class of model-free +machine learning that enables an accurate forecasting of time series data +generated by dynamical systems. We demonstrate that NG-RC can accurately +predict full many-body quantum dynamics, instead of merely concentrating on the +dynamics of observables, which is the conventional application of reservoir +computing. In addition, we apply a technique which we refer to as skipping +ahead to predict far future states accurately without the need to extract +information about the intermediate states. However, adopting a classical NG-RC +for many-body quantum dynamics prediction is computationally prohibitive due to +the large Hilbert space of sample input data. In this work, we propose an +end-to-end quantum algorithm for many-body quantum dynamics forecasting with a +quantum computational speedup via the block-encoding technique. This proposal +presents an efficient model-free quantum scheme to forecast quantum dynamics +coherently, bypassing inductive biases incurred in a model-based approach. + +
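+
+ A minimal sketch of classical NG-RC for context: the feature vector stacks k time-delayed states plus their pairwise products, and a ridge regression maps features to the next state. The sizes and the quadratic feature set are standard classical choices, not the paper's quantum construction.
+
+ import numpy as np
+
+ def ngrc_features(x, k=2):
+     # x: (T, d) time series; stack k delayed copies plus quadratic terms
+     lin = np.hstack([x[i:len(x) - k + i + 1] for i in range(k)])
+     quad = np.einsum("ti,tj->tij", lin, lin).reshape(len(lin), -1)
+     return np.hstack([lin, quad])
+
+ def fit_ngrc(x, k=2, ridge=1e-6):
+     feats = ngrc_features(x, k)[:-1]     # features at time t ...
+     target = x[k:]                       # ... predict the state at time t+1
+     W = np.linalg.solve(feats.T @ feats + ridge * np.eye(feats.shape[1]),
+                         feats.T @ target)
+     return W
+
+ x = np.cumsum(np.random.randn(200, 3), axis=0)
+ W = fit_ngrc(x)      # next state is approximated by features(x)[t] @ W
+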
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ Reinforcement Learning for Sampling on Temporal Medical Imaging + Sequences ICML 2023 + + +
+ Accelerated magnetic resonance imaging resorts to either Fourier-domain +subsampling or better reconstruction algorithms to deal with fewer measurements +while still generating medical images of high quality. Determining the optimal +sampling strategy given a fixed reconstruction protocol often has combinatorial +complexity. In this work, we apply double deep Q-learning and REINFORCE +algorithms to learn the sampling strategy for dynamic image reconstruction. We +consider the data in the format of time series, and the reconstruction method +is a pre-trained autoencoder-typed neural network. We present a proof of +concept that reinforcement learning algorithms are effective to discover the +optimal sampling pattern which underlies the pre-trained reconstructor network +(i.e., the dynamics in the environment). The code for replicating experiments +can be found at https://github.com/zhishenhuang/RLsamp. + +
+
+ comment: ICML 2023 Workshop SODS +
+
+
+
+
+ + ☆ Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals + + +
+ We consider the problem of sampling from a distribution governed by a +potential function. This work proposes an explicit score-based MCMC method that +is deterministic, resulting in a deterministic evolution for particles rather +than a stochastic differential equation evolution. The score term is given in +closed form by a regularized Wasserstein proximal, using a kernel convolution +that is approximated by sampling. We demonstrate fast convergence on various +problems and show improved dimensional dependence of mixing time bounds for the +case of Gaussian distributions compared to the unadjusted Langevin algorithm +(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We additionally +derive closed form expressions for the distributions at each iterate for +quadratic potential functions, characterizing the variance reduction. Empirical +results demonstrate that the particles behave in an organized manner, lying on +level set contours of the potential. Moreover, the posterior mean estimator of +the proposed method is shown to be closer to the maximum a-posteriori estimator +compared to ULA and MALA, in the context of Bayesian logistic regression. + +
+
+
+
+
+ + ☆ Entropy-based Guidance of Deep Neural Networks for Accelerated + Convergence and Improved Performance + + +
+ Neural networks have dramatically increased our capacity to learn from large, +high-dimensional datasets across innumerable disciplines. However, their +decisions are not easily interpretable, their computational costs are high, and +building and training them are uncertain processes. To add structure to these +efforts, we derive new mathematical results to efficiently measure the changes +in entropy as fully-connected and convolutional neural networks process data, +and introduce entropy-based loss terms. Experiments in image compression and +image classification on benchmark datasets demonstrate these losses guide +neural networks to learn rich latent data representations in fewer dimensions, +converge in fewer training epochs, and achieve better test metrics. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Application of Quantum Pre-Processing Filter for Binary Image + Classification with Small Samples + + +
+ Over the past few years, there has been significant interest in Quantum +Machine Learning (QML) among researchers, as it has the potential to transform +the field of machine learning. Several models that exploit the properties of +quantum mechanics have been developed for practical applications. In this +study, we investigated the application of our previously proposed quantum +pre-processing filter (QPF) to binary image classification. We evaluated the +QPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits +and alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic +sign images). Similar to our previous multi-class classification results, the +application of QPF improved the binary image classification accuracy using +neural network against MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8% +to 98.3%, and 71.2% to 76.1%, respectively, but degraded it against GTSRB from +93.5% to 92.0%. We then applied QPF in cases using a smaller number of training +and testing samples, i.e. 80 and 20 samples per class, respectively. In order +to derive statistically stable results, we conducted the experiment with 100 +trials choosing randomly different training and testing samples and averaging +the results. The result showed that the application of QPF did not improve the +image classification accuracy against MNIST and EMNIST but improved it against +CIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively. +Further research will be conducted as part of future work to investigate the +potential of QPF to assess the scalability of the proposed approach to larger +and complex datasets. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Maestro: Uncovering Low-Rank Structures via Trainable Decomposition + + +
+ Deep Neural Networks (DNNs) have been a large driver and enabler for AI breakthroughs in recent years. These models have been getting larger in their attempt to become more accurate and tackle new upcoming use-cases, including AR/VR and intelligent assistants. However, the training process of such large models is a costly and time-consuming process, which typically yields a single model to fit all targets. To mitigate this, various techniques have been proposed in the literature, including pruning, sparsification or quantization of the model weights and updates. While able to achieve high compression rates, they often incur computational overheads or accuracy penalties. Alternatively, factorization methods have been leveraged to incorporate low-rank compression in the training process. Similarly, such techniques (e.g., SVD) frequently rely on the computationally expensive decomposition of layers and are potentially sub-optimal for non-linear models, such as DNNs. In this work, we take a further step in designing efficient low-rank models and propose Maestro, a framework for trainable low-rank layers. Instead of regularly applying a priori decompositions such as SVD, the low-rank structure is built into the training process through a generalized variant of Ordered Dropout. This method imposes an importance ordering via sampling on the decomposed DNN structure. Our theoretical analysis demonstrates that our method recovers the SVD decomposition of linear mapping on uniformly distributed data and PCA for linear autoencoders. We further apply our technique on DNNs and empirically illustrate that Maestro enables the extraction of lower footprint models that preserve model performance while allowing for graceful accuracy-latency tradeoff for the deployment to devices of different capabilities.
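+
+ A minimal sketch of a trainable low-rank layer with ordered-dropout-style truncation: the weight is a product U V^T and each training step keeps only the first k rank components, imposing an importance ordering. The uniform rank sampling below is an assumption, not Maestro's exact scheme.
+
+ import torch
+ import torch.nn as nn
+
+ class OrderedLowRankLinear(nn.Module):
+     def __init__(self, d_in, d_out, max_rank=32):
+         super().__init__()
+         self.U = nn.Parameter(torch.randn(d_out, max_rank) / max_rank**0.5)
+         self.V = nn.Parameter(torch.randn(d_in, max_rank) / max_rank**0.5)
+         self.max_rank = max_rank
+
+     def forward(self, x):
+         k = self.max_rank
+         if self.training:                     # sample an active rank per step
+             k = int(torch.randint(1, self.max_rank + 1, (1,)))
+         W = self.U[:, :k] @ self.V[:, :k].T   # truncated low-rank weight
+         return x @ W.T
+
+ layer = OrderedLowRankLinear(128, 64)
+ y = layer(torch.randn(4, 128))
+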
+
+ comment: Under review +
+
+
+
+
+ + ☆ Optimal Economic Gas Turbine Dispatch with Deep Reinforcement Learning + + +
+ Dispatching strategies for gas turbines (GTs) are changing in modern electricity grids. A growing incorporation of intermittent renewable energy requires GTs to operate in more but shorter cycles and more frequently at partial load. Deep reinforcement learning (DRL) has recently emerged as a tool that can cope with this development and dispatch GTs economically. The key advantages of DRL are model-free optimization and the ability to handle uncertainties, such as those introduced by varying loads or renewable energy production. In this study, three popular DRL algorithms are implemented for an economic GT dispatch problem on a case study in Alberta, Canada. We highlight the benefits of DRL by incorporating an existing thermodynamic software provided by Siemens Energy into the environment model and by simulating uncertainty via varying electricity prices, loads, and ambient conditions. Among the tested algorithms and baseline methods, Deep Q-Networks (DQN) obtained the highest rewards while Proximal Policy Optimization (PPO) was the most sample efficient. We further propose and implement a method to assign GT operation and maintenance cost dynamically based on operating hours and cycles. Compared to existing methods, our approach better approximates the true cost of modern GT dispatch and hence leads to more realistic policies.
+
+ comment: This work has been accepted to IFAC for publication under a Creative + Commons Licence CC-BY-NC-ND +
+
+
+
+
+ + ☆ Gender bias and stereotypes in Large Language Models + + +
+ Large Language Models (LLMs) have made substantial progress in the past several months, shattering state-of-the-art benchmarks in many domains. This paper investigates LLMs' behavior with respect to gender stereotypes, a known issue for prior models. We use a simple paradigm to test the presence of gender bias, building on but differing from WinoBias, a commonly used gender bias dataset which is likely to be included in the training data of current LLMs. We test four recently published LLMs and demonstrate that they express biased assumptions about men's and women's occupations. Our contributions in this paper are as follows: (a) LLMs are 3-6 times more likely to choose an occupation that stereotypically aligns with a person's gender; (b) these choices align with people's perceptions better than with the ground truth as reflected in official job statistics; (c) LLMs in fact amplify the bias beyond what is reflected in perceptions or the ground truth; (d) LLMs ignore crucial ambiguities in sentence structure 95% of the time in our study items, but when explicitly prompted, they recognize the ambiguity; (e) LLMs provide explanations for their choices that are factually inaccurate and likely obscure the true reason behind their predictions. That is, they provide rationalizations of their biased behavior. This highlights a key property of these models: LLMs are trained on imbalanced datasets; as such, even with the recent successes of reinforcement learning with human feedback, they tend to reflect those imbalances back at us. As with other types of societal biases, we suggest that LLMs must be carefully tested to ensure that they treat minoritized individuals and communities equitably.
+
+ comment: ACM Collective Intelligence +
+
+
+
+
+ + ☆ Matbench Discovery -- An evaluation framework for machine learning + crystal stability prediction + + +
+ Matbench Discovery simulates the deployment of machine learning (ML) energy models in a high-throughput search for stable inorganic crystals. We address the disconnect between (i) thermodynamic stability and formation energy and (ii) in-domain vs out-of-distribution performance. Alongside this paper, we publish a Python package to aid with future model submissions and a growing online leaderboard with further insights into trade-offs between various performance metrics. To answer the question of which ML methodology performs best at materials discovery, our initial release explores a variety of models including random forests, graph neural networks (GNN), one-shot predictors, iterative Bayesian optimizers and universal interatomic potentials (UIP). Ranked best-to-worst by their test set F1 score on thermodynamic stability prediction, we find CHGNet > M3GNet > MACE > ALIGNN > MEGNet > CGCNN > CGCNN+P > Wrenformer > BOWSR > Voronoi tessellation fingerprints with random forest. The top 3 models are UIPs, the winning methodology for ML-guided materials discovery, achieving F1 scores of ~0.6 for crystal stability classification and discovery acceleration factors (DAF) of up to 5x on the first 10k most stable predictions compared to dummy selection from our test set. We also highlight a sharp disconnect between commonly used global regression metrics and more task-relevant classification metrics. Accurate regressors are susceptible to unexpectedly high false-positive rates if those accurate predictions lie close to the decision boundary at 0 eV/atom above the convex hull, where most materials are. Our results highlight the need to focus on classification metrics that actually correlate with improved stability hit rate.
+
+ comment: 18 pages, 9 figures, 3 tables +
+
+
+
+
+ + ☆ On Reward Structures of Markov Decision Processes + + +
+ A Markov decision process can be parameterized by a transition kernel and a reward function. Both play essential roles in the study of reinforcement learning, as evidenced by their presence in the Bellman equations. In our inquiry into various kinds of "costs" associated with reinforcement learning, inspired by the demands of robotic applications, rewards are central to understanding the structure of a Markov decision process, and reward-centric notions can elucidate important concepts in reinforcement learning. Specifically, we studied the sample complexity of policy evaluation and developed a novel estimator with an instance-specific error bound of $\tilde{O}(\sqrt{\frac{\tau_s}{n}})$ for estimating a single state value. Under the online regret minimization setting, we refined the transition-based MDP constant, the diameter, into a reward-based constant, the maximum expected hitting cost, and with it provided a theoretical explanation for how a well-known technique, potential-based reward shaping, can accelerate learning with expert knowledge. In an attempt to study safe reinforcement learning, we modeled hazardous environments with irrecoverability and proposed a quantitative notion of safe learning via reset efficiency. In this setting, we modified a classic algorithm to account for resets, achieving promising preliminary numerical results. Lastly, for MDPs with multiple reward functions, we developed a planning algorithm that computationally efficiently finds Pareto optimal stochastic policies.
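+
+ For reference, a minimal sketch of potential-based reward shaping, the well-known technique whose acceleration effect the thesis explains: the shaped reward adds the discounted change of a state potential.
+
+ def shaped_reward(r, s, s_next, potential, gamma=0.99):
+     # r' = r + gamma * Phi(s') - Phi(s); policy invariance is preserved
+     return r + gamma * potential(s_next) - potential(s)
+
+ # toy usage: a potential encoding expert knowledge "closer to goal is better"
+ potential = lambda s: -abs(10 - s)
+ print(shaped_reward(0.0, s=3, s_next=4, potential=potential))  # positive bonus
+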
+
+ comment: This PhD thesis draws heavily from arXiv:1907.02114 and + arXiv:2002.06299 +
+
+
+
+
+ + ☆ RecRec: Algorithmic Recourse for Recommender Systems CIKM 2023 + + +
+ Recommender systems play an essential role in the choices people make in
+domains such as entertainment, shopping, food, news, employment, and education.
+The machine learning models underlying these recommender systems are often
+enormously large and black-box in nature for users, content providers, and
+system developers alike. It is often crucial for all stakeholders to understand
+the model's rationale behind making certain predictions and recommendations.
+This is especially true for the content providers whose livelihoods depend on
+the recommender system. Drawing motivation from the practitioners' need, in
+this work, we propose RecRec, a recourse framework for recommender systems
+targeted towards content providers. Algorithmic recourse in the recommendation
+setting is a set of actions that, if executed, would modify the recommendations
+(or ranking) of an item in the desired manner. A recourse suggests actions of
+the form: "if a feature changes from X to Y, then the ranking of that item for a set
+of users will change to Z." Furthermore, we demonstrate that RecRec is highly
+effective in generating valid, sparse, and actionable recourses through an
+empirical evaluation of recommender systems trained on three real-world
+datasets. To the best of our knowledge, this work is the first to conceptualize
+and empirically test a generalized framework for generating recourses for
+recommender systems.
+
+
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Pruning Self-Attention for Zero-Shot Multi-Speaker Text-to-Speech INTERSPEECH 2023 + + +
+ For personalized speech generation, a neural text-to-speech (TTS) model must
+be successfully implemented with limited data from a target speaker. To this
+end, the baseline TTS model needs to be amply generalized to out-of-domain data
+(i.e., the target speaker's speech). However, approaches to address this
+out-of-domain generalization problem in TTS have yet to be thoroughly studied.
+In this work, we propose an effective pruning method for transformers, known as
+sparse attention, to improve the TTS model's generalization abilities. In
+particular, we prune off redundant connections from self-attention layers whose
+attention weights are below a given threshold. To flexibly determine the
+pruning strength when searching for the optimal degree of generalization, we
+also propose a new differentiable pruning method that allows the model to
+automatically learn the thresholds. Evaluations on zero-shot multi-speaker TTS
+verify the effectiveness of our method in terms of voice quality and speaker
+similarity.
+
+
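A minimal NumPy sketch of the threshold-based attention pruning described above: weights below a cut-off are zeroed and each row is renormalized. The threshold here is a fixed constant for illustration, whereas the differentiable variant in the abstract learns it during training.

```python
import numpy as np

def prune_attention(scores, threshold=0.05):
    """Zero out attention weights below a threshold and renormalize rows.
    `scores` are raw (pre-softmax) attention logits of shape (T, T)."""
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)          # softmax
    weights = np.where(weights < threshold, 0.0, weights)   # hard pruning
    return weights / weights.sum(axis=-1, keepdims=True)    # renormalize

rng = np.random.default_rng(0)
attn = prune_attention(rng.normal(size=(6, 6)))
print("sparsity:", (attn == 0).mean())
print("row sums:", attn.sum(axis=-1))  # all 1.0 after renormalization
```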
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
+ + ☆ BayOTIDE: Bayesian Online Multivariate Time series Imputation with + functional decomposition + + +
+ In real-world scenarios like traffic and energy, massive time-series data
+with missing values and noise are widely observed, and even sampled irregularly.
+While many imputation methods have been proposed, most of them work with a
+local horizon, which means models are trained by splitting the long sequence
+into batches of fixed-size patches. This local horizon can make models ignore
+global trends or periodic patterns. More importantly, almost all methods assume
+the observations are sampled at regular time stamps, and fail to handle complex
+irregularly sampled time series arising from different applications. Finally,
+most existing methods are learned in an offline manner. Thus, they are not
+suitable for many applications with fast-arriving streaming data. To overcome
+these limitations, we propose BayOTIDE: Bayesian Online Multivariate Time series
+Imputation with functional decomposition. We treat the multivariate time series
+as the weighted combination of groups of low-rank temporal factors with
+different patterns. We apply a group of Gaussian Processes (GPs) with different
+kernels as functional priors to fit the factors. For computational efficiency,
+we further convert the GPs into a state-space prior by constructing an
+equivalent stochastic differential equation (SDE), and developing a scalable
+algorithm for online inference. The proposed method can not only handle
+imputation over arbitrary time stamps, but also offer uncertainty
+quantification and interpretability for the downstream application. We evaluate
+our method on both synthetic and real-world datasets.
+
+
+
+
+
+
+ + ☆ Maturity-Aware Active Learning for Semantic Segmentation with + Hierarchically-Adaptive Sample Assessment BMVC 2023 + + +
+ Active Learning (AL) for semantic segmentation is challenging due to heavy
+class imbalance and different ways of defining "sample" (pixels, areas, etc.),
+leaving the interpretation of the data distribution ambiguous. We propose
+"Maturity-Aware Distribution Breakdown-based Active Learning" (MADBAL), an AL
+method that benefits from a hierarchical approach to define a multiview data
+distribution, which takes into account the different "sample" definitions
+jointly, and is hence able to select the most impactful segmentation pixels with
+comprehensive understanding. MADBAL also features a novel uncertainty
+formulation, where AL supporting modules are included to sense the features'
+maturity, whose weighted influence continuously contributes to the uncertainty
+detection. In this way, MADBAL makes significant performance leaps even in the
+early AL stage, hence reducing the training burden significantly. It
+outperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as
+verified in our extensive experiments.
+
+
+
+ comment: Accepted to the 34th British Machine Vision Conference (BMVC 2023) +
+
+
+
+
+ + ☆ Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in + Recommendation Networks + + +
+ Recommendation models are vital in delivering personalized user experiences
+by leveraging the correlation between multiple input features. However, deep
+learning-based recommendation models often face challenges due to evolving user
+behaviour and item features, leading to covariate shifts. Effective
+cross-feature learning is crucial to handle data distribution drift and
+adapt to changing user behaviour. Traditional feature interaction techniques
+have limitations in achieving optimal performance in this context.
+ This work introduces Ad-Rec, an advanced network that leverages feature
+interaction techniques to address covariate shifts. This helps eliminate
+irrelevant interactions in recommendation tasks. Ad-Rec leverages masked
+transformers to enable the learning of higher-order cross-features while
+mitigating the impact of data distribution drift. Our approach improves model
+quality, accelerates convergence, and reduces training time, as measured by the
+Area Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its
+ability to achieve superior model quality through comprehensive ablation
+studies.
+
+
+
+
+
+
+ + ☆ Statistically Efficient Variance Reduction with Double Policy Estimation + for Off-Policy Evaluation in Sequence-Modeled Reinforcement Learning + + +
+ Offline reinforcement learning aims to utilize datasets of previously
+gathered environment-action interaction records to learn a policy without
+access to the real environment. Recent work has shown that offline
+reinforcement learning can be formulated as a sequence modeling problem and
+solved via supervised learning with approaches such as decision transformer.
+While these sequence-based methods achieve competitive results over
+return-to-go methods, especially on tasks that require longer episodes or with
+scarce rewards, importance sampling is not considered to correct the policy
+bias when dealing with off-policy data, mainly due to the absence of behavior
+policy and the use of deterministic evaluation policies. To this end, we
+propose DPE: an RL algorithm that blends offline sequence modeling and offline
+reinforcement learning with Double Policy Estimation (DPE) in a unified
+framework with statistically proven properties on variance reduction. We
+validate our method in multiple tasks of OpenAI Gym with D4RL benchmarks. Our
+method brings performance improvements to the selected methods, outperforming
+SOTA baselines in several tasks and demonstrating the advantages of enabling
+double policy estimation for sequence-modeled reinforcement learning.
+
+
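For context on the importance-sampling correction discussed above, here is a minimal ordinary importance-sampling (OIS) value estimate with per-trajectory likelihood ratios. The probabilities and returns are made up, and DPE's double policy estimation (fitting both behavior and evaluation policies from data) is deliberately not shown.

```python
import numpy as np

def trajectory_importance_weight(probs_eval, probs_behavior):
    """Product of per-step likelihood ratios pi_e(a_t|s_t) / pi_b(a_t|s_t)."""
    return float(np.prod(np.asarray(probs_eval) / np.asarray(probs_behavior)))

def ois_value_estimate(returns, weights):
    """Ordinary importance-sampling estimate of the evaluation policy's value."""
    return float(np.mean(np.asarray(weights) * np.asarray(returns)))

# Toy example: three logged trajectories with their observed returns.
w = [trajectory_importance_weight([0.9, 0.8], [0.5, 0.6]),
     trajectory_importance_weight([0.2, 0.7], [0.5, 0.4]),
     trajectory_importance_weight([0.6, 0.6], [0.5, 0.5])]
print(ois_value_estimate(returns=[1.0, 0.0, 0.5], weights=w))
```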
+
+
+
+
+ + ☆ Conformal Meta-learners for Predictive Inference of Individual Treatment + Effects + + +
+ We investigate the problem of machine learning-based (ML) predictive +inference on individual treatment effects (ITEs). Previous work has focused +primarily on developing ML-based meta-learners that can provide point estimates +of the conditional average treatment effect (CATE); these are model-agnostic +approaches for combining intermediate nuisance estimates to produce estimates +of CATE. In this paper, we develop conformal meta-learners, a general framework +for issuing predictive intervals for ITEs by applying the standard conformal +prediction (CP) procedure on top of CATE meta-learners. We focus on a broad +class of meta-learners based on two-stage pseudo-outcome regression and develop +a stochastic ordering framework to study their validity. We show that inference +with conformal meta-learners is marginally valid if their (pseudo outcome) +conformity scores stochastically dominate oracle conformity scores evaluated on +the unobserved ITEs. Additionally, we prove that commonly used CATE +meta-learners, such as the doubly-robust learner, satisfy a model- and +distribution-free stochastic (or convex) dominance condition, making their +conformal inferences valid for practically-relevant levels of target coverage. +Whereas existing procedures conduct inference on nuisance parameters (i.e., +potential outcomes) via weighted CP, conformal meta-learners enable direct +inference on the target parameter (ITE). Numerical experiments show that +conformal meta-learners provide valid intervals with competitive efficiency +while retaining the favorable point estimation properties of CATE +meta-learners. + +
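A minimal sketch of the final conformalization step: split conformal prediction with absolute-residual conformity scores computed on pseudo-outcomes, as in the framework described above. The pseudo-outcome construction itself (e.g., the doubly-robust learner) is omitted, and the synthetic calibration data is an assumption.

```python
import numpy as np

def conformal_interval(pseudo_cal, pred_cal, pred_test, alpha=0.1):
    """Split conformal prediction: calibrate |pseudo-outcome - prediction|
    residuals, then pad test predictions by their (1 - alpha) quantile."""
    scores = np.sort(np.abs(pseudo_cal - pred_cal))
    n = len(scores)
    k = min(int(np.ceil((n + 1) * (1 - alpha))), n)  # finite-sample rank
    q = scores[k - 1]
    return pred_test - q, pred_test + q

rng = np.random.default_rng(0)
pred_cal = rng.normal(size=500)                          # CATE estimates
pseudo_cal = pred_cal + rng.normal(scale=0.3, size=500)  # pseudo-outcomes
lo, hi = conformal_interval(pseudo_cal, pred_cal, np.array([0.2, -0.1]))
print(lo, hi)
```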
+
+
+
+
+ + ☆ When hard negative sampling meets supervised contrastive learning + + +
+ State-of-the-art image models predominantly follow a two-stage strategy: +pre-training on large datasets and fine-tuning with cross-entropy loss. Many +studies have shown that using cross-entropy can result in sub-optimal +generalisation and stability. While the supervised contrastive loss addresses +some limitations of cross-entropy loss by focusing on intra-class similarities +and inter-class differences, it neglects the importance of hard negative +mining. We propose that models will benefit from performance improvement by +weighting negative samples based on their dissimilarity to positive +counterparts. In this paper, we introduce a new supervised contrastive learning +objective, SCHaNe, which incorporates hard negative sampling during the +fine-tuning phase. Without requiring specialized architectures, additional +data, or extra computational resources, experimental results indicate that +SCHaNe outperforms the strong baseline BEiT-3 in Top-1 accuracy across various +benchmarks, with significant gains of up to $3.32\%$ in few-shot learning +settings and $3.41\%$ in full dataset fine-tuning. Importantly, our proposed +objective sets a new state-of-the-art for base models on ImageNet-1k, achieving +an 86.14\% accuracy. Furthermore, we demonstrate that the proposed objective +yields better embeddings and explains the improved effectiveness observed in +our experiments. + +
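To make the idea of weighting negatives by their similarity to the anchor concrete, here is a small NumPy sketch of a hard-negative-weighted supervised contrastive loss. The exponential weighting, temperature, and beta values are one plausible choice in the spirit of hard negative mining, not necessarily SCHaNe's exact formulation.

```python
import numpy as np

def hard_neg_supcon(z, labels, tau=0.1, beta=1.0):
    """Supervised contrastive loss where each negative's contribution to the
    denominator is scaled by exp(beta * sim), so negatives that are more
    similar to the anchor (harder negatives) weigh more."""
    z = z / np.linalg.norm(z, axis=1, keepdims=True)
    sim = z @ z.T / tau
    labels = np.asarray(labels)
    total, n_pairs = 0.0, 0
    for i in range(len(labels)):
        pos = np.where((labels == labels[i]) & (np.arange(len(labels)) != i))[0]
        neg = np.where(labels != labels[i])[0]
        if len(pos) == 0 or len(neg) == 0:
            continue
        w = np.exp(beta * sim[i, neg])
        w = w * len(neg) / w.sum()               # keep the mean weight at 1
        denom_neg = np.sum(w * np.exp(sim[i, neg]))
        for p in pos:
            total += -np.log(np.exp(sim[i, p]) / (np.exp(sim[i, p]) + denom_neg))
            n_pairs += 1
    return total / n_pairs

rng = np.random.default_rng(0)
print(hard_neg_supcon(rng.normal(size=(8, 16)), [0, 0, 1, 1, 2, 2, 0, 1]))
```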
+
+
+
+
+ + ♻ ☆ Decentralized Multi-Agent Reinforcement Learning with Global State + Prediction + + +
+ Deep reinforcement learning (DRL) has seen remarkable success in the control +of single robots. However, applying DRL to robot swarms presents significant +challenges. A critical challenge is non-stationarity, which occurs when two or +more robots update individual or shared policies concurrently, thereby engaging +in an interdependent training process with no guarantees of convergence. +Circumventing non-stationarity typically involves training the robots with +global information about other agents' states and/or actions. In contrast, in +this paper we explore how to remove the need for global information. We pose +our problem as a Partially Observable Markov Decision Process, due to the +absence of global knowledge on other agents. Using collective transport as a +testbed scenario, we study two approaches to multi-agent training. In the +first, the robots exchange no messages, and are trained to rely on implicit +communication through push-and-pull on the object to transport. In the second +approach, we introduce Global State Prediction (GSP), a network trained to +forma a belief over the swarm as a whole and predict its future states. We +provide a comprehensive study over four well-known deep reinforcement learning +algorithms in environments with obstacles, measuring performance as the +successful transport of the object to the goal within a desired time-frame. +Through an ablation study, we show that including GSP boosts performance and +increases robustness when compared with methods that use global knowledge. + +
+
+
+
+
+ + ♻ ☆ Revisiting mass-radius relationships for exoplanet populations: a + machine learning insight + + +
+ The growing number of exoplanet discoveries and advances in machine learning +techniques have opened new avenues for exploring and understanding the +characteristics of worlds beyond our Solar System. In this study, we employ +efficient machine learning approaches to analyze a dataset comprising 762 +confirmed exoplanets and eight Solar System planets, aiming to characterize +their fundamental quantities. By applying different unsupervised clustering +algorithms, we classify the data into two main classes: 'small' and 'giant' +planets, with cut-off values at $R_{p}=8.13R_{\oplus}$ and +$M_{p}=52.48M_{\oplus}$. This classification reveals an intriguing distinction: +giant planets have lower densities, suggesting higher H-He mass fractions, +while small planets are denser, composed mainly of heavier elements. We apply +various regression models to uncover correlations between physical parameters +and their predictive power for exoplanet radius. Our analysis highlights that +planetary mass, orbital period, and stellar mass play crucial roles in +predicting exoplanet radius. Among the models evaluated, the Support Vector +Regression consistently outperforms others, demonstrating its promise for +obtaining accurate planetary radius estimates. Furthermore, we derive +parametric equations using the M5P and Markov Chain Monte Carlo methods. +Notably, our study reveals a noteworthy result: small planets exhibit a +positive linear mass-radius relation, aligning with previous findings. +Conversely, for giant planets, we observe a strong correlation between +planetary radius and the mass of their host stars, which might provide +intriguing insights into the relationship between giant planet formation and +stellar characteristics. + +
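A toy end-to-end skeleton of the pipeline described above, assuming scikit-learn: unsupervised clustering into 'small' vs 'giant' planets, then Support Vector Regression for radius. The features and targets are synthetic stand-ins for the real catalogue, which is not reproduced here.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

rng = np.random.default_rng(0)
# Stand-ins for (log planet mass, log orbital period, stellar mass).
X = rng.normal(size=(770, 3))
y = 0.5 * X[:, 0] + 0.1 * X[:, 2] + rng.normal(scale=0.1, size=770)  # log radius

# Unsupervised split into two populations ('small' vs 'giant' planets).
clusters = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
print("cluster sizes:", np.bincount(clusters))

# SVR for radius prediction, the best-performing regressor in the abstract.
print("CV R^2:", cross_val_score(SVR(kernel="rbf", C=10.0), X, y,
                                 cv=5, scoring="r2").mean())
```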
+
+ comment: Accepted for publication in MNRAS. 17 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ The feasibility of artificial consciousness through the lens of + neuroscience + + +
+ Interactions with large language models have led to the suggestion that these +models may soon be conscious. From the perspective of neuroscience, this +position is difficult to defend. For one, the inputs to large language models +lack the embodied, embedded information content characteristic of our sensory +contact with the world around us. Secondly, the architecture of large language +models is missing key features of the thalamocortical system that have been +linked to conscious awareness in mammals. Finally, the evolutionary and +developmental trajectories that led to the emergence of living conscious +organisms arguably have no parallels in artificial systems as envisioned today. +The existence of living organisms depends on their actions, and their survival +is intricately linked to multi-level cellular, inter-cellular, and organismal +processes culminating in agency and consciousness. + +
+
+
+
+
+ + ♻ ☆ Reconstructing Spatiotemporal Data with C-VAEs + + +
+ The continuous representation of spatiotemporal data commonly relies on using
+abstract data types, such as \textit{moving regions}, to represent entities
+whose shape and position continuously change over time. Creating this
+representation from discrete snapshots of real-world entities requires using
+interpolation methods to compute in-between data representations and estimate
+the position and shape of the object of interest at arbitrary temporal points.
+Existing region interpolation methods often fail to generate smooth and
+realistic representations of a region's evolution. However, recent advancements
+in deep learning techniques have revealed the potential of deep models trained
+on discrete observations to capture spatiotemporal dependencies through
+implicit feature learning.
+ In this work, we explore the capabilities of Conditional Variational
+Autoencoder (C-VAE) models to generate smooth and realistic representations of
+the spatiotemporal evolution of moving regions. We evaluate our proposed
+approach on a sparsely annotated dataset on the burnt area of a forest fire. We
+apply compression operations to sample from the dataset and use the C-VAE model
+and other commonly used interpolation algorithms to generate in-between region
+representations. To evaluate the performance of the methods, we compare their
+interpolation results with manually annotated data and regions generated by a
+U-Net model. We also assess the quality of generated data considering temporal
+consistency metrics.
+ The proposed C-VAE-based approach demonstrates competitive results in
+geometric similarity metrics. It also exhibits superior temporal consistency,
+suggesting that C-VAE models may be a viable alternative for modelling the
+spatiotemporal evolution of 2D moving regions.
+
+
+
+ comment: Update acknowledgments to include published article information +
+
+
+
+
+ + ♻ ☆ Examining Policy Entropy of Reinforcement Learning Agents for + Personalization Tasks + + +
+ This effort is focused on examining the behavior of reinforcement learning +systems in personalization environments and detailing the differences in policy +entropy associated with the type of learning algorithm utilized. We demonstrate +that Policy Optimization agents often possess low-entropy policies during +training, which in practice results in agents prioritizing certain actions and +avoiding others. Conversely, we also show that Q-Learning agents are far less +susceptible to such behavior and generally maintain high-entropy policies +throughout training, which is often preferable in real-world applications. We +provide a wide range of numerical experiments as well as theoretical +justification to show that these differences in entropy are due to the type of +learning being employed. + +
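The policy entropy at the center of this comparison is simply the Shannon entropy of the action distribution at a state; a quick sketch with illustrative action probabilities:

```python
import numpy as np

def policy_entropy(probs):
    """Shannon entropy H(pi(.|s)) = -sum_a pi(a|s) * log pi(a|s)."""
    p = np.asarray(probs, dtype=float)
    p = p[p > 0]                      # convention: 0 * log 0 = 0
    return float(-(p * np.log(p)).sum())

# A near-deterministic policy (the low-entropy behaviour the abstract
# attributes to policy optimization) vs. a near-uniform one.
print(policy_entropy([0.97, 0.01, 0.01, 0.01]))  # low entropy
print(policy_entropy([0.25, 0.25, 0.25, 0.25]))  # maximal entropy, log(4)
```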
+
+
+
+
+ + ♻ ☆ Wasserstein Geodesic Generator for Conditional Distributions + + +
+ Generating samples given a specific label requires estimating conditional +distributions. We derive a tractable upper bound of the Wasserstein distance +between conditional distributions to lay the theoretical groundwork to learn +conditional distributions. Based on this result, we propose a novel conditional +generation algorithm where conditional distributions are fully characterized by +a metric space defined by a statistical distance. We employ optimal transport +theory to propose the Wasserstein geodesic generator, a new conditional +generator that learns the Wasserstein geodesic. The proposed method learns both +conditional distributions for observed domains and optimal transport maps +between them. The conditional distributions given unobserved intermediate +domains are on the Wasserstein geodesic between conditional distributions given +two observed domain labels. Experiments on face images with light conditions as +domain labels demonstrate the efficacy of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Counterpart Fairness -- Addressing Systematic between-group Differences + in Fairness Evaluation + + +
+ When using machine learning (ML) to aid decision-making, it is critical to
+ensure that an algorithmic decision is fair, i.e., it does not discriminate
+against specific individuals/groups, particularly those from underprivileged
+populations. Existing group fairness methods require equal group-wise measures,
+a requirement that, however, fails to consider systematic between-group
+differences. The confounding factors, which are non-sensitive variables but
+manifest systematic differences, can significantly affect fairness evaluation.
+To tackle this problem, we believe that a fairness measurement should be based
+on the comparison between counterparts (i.e., individuals who are similar to
+each other with respect to the task of interest) from different groups, whose
+group identities cannot be distinguished algorithmically by exploring
+confounding factors. We have developed a propensity-score-based method for
+identifying counterparts, which prevents fairness evaluation from comparing
+"oranges" with "apples". In addition, we propose a counterpart-based
+statistical fairness index, termed Counterpart-Fairness (CFair), to assess
+fairness of ML models. Various empirical studies were conducted to validate
+the effectiveness of CFair. We publish our code at
+\url{https://github.com/zhengyjo/CFair}.
+
+
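A minimal sketch of the counterpart idea, assuming scikit-learn: fit a propensity model for group membership on the confounding features, then match individuals across groups by propensity score. The synthetic data, feature set, and 1-nearest-neighbour matching rule are illustrative simplifications of the paper's method.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(400, 5))                              # non-sensitive features
group = (X[:, 0] + rng.normal(size=400) > 0).astype(int)   # group identity

# Propensity of belonging to group 1 given the confounding features.
prop = LogisticRegression(max_iter=1000).fit(X, group).predict_proba(X)[:, 1]

# Match each group-1 individual to the group-0 individual with the closest
# propensity score; these matched pairs are the "counterparts".
idx0, idx1 = np.where(group == 0)[0], np.where(group == 1)[0]
pairs = [(i, idx0[np.argmin(np.abs(prop[idx0] - prop[i]))]) for i in idx1]
print(pairs[:5])
```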
+
+ comment: 25 pages, 6 figures, 16 tables +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning with Delayed, Composite, and Partially Anonymous + Reward + + +
+ We investigate an infinite-horizon average reward Markov Decision Process +(MDP) with delayed, composite, and partially anonymous reward feedback. The +delay and compositeness of rewards mean that rewards generated as a result of +taking an action at a given state are fragmented into different components, and +they are sequentially realized at delayed time instances. The partial anonymity +attribute implies that a learner, for each state, only observes the aggregate +of past reward components generated as a result of different actions taken at +that state, but realized at the observation instance. We propose an algorithm +named $\mathrm{DUCRL2}$ to obtain a near-optimal policy for this setting and +show that it achieves a regret bound of $\tilde{\mathcal{O}}\left(DS\sqrt{AT} + +d (SA)^3\right)$ where $S$ and $A$ are the sizes of the state and action +spaces, respectively, $D$ is the diameter of the MDP, $d$ is a parameter upper +bounded by the maximum reward delay, and $T$ denotes the time horizon. This +demonstrates the optimality of the bound in the order of $T$, and an additive +impact of the delay. + +
+
+
+
+
+ + ♻ ☆ Safety Filter Design for Neural Network Systems via Convex Optimization + + +
+ With the increase in data availability, it has been widely demonstrated that +neural networks (NN) can capture complex system dynamics precisely in a +data-driven manner. However, the architectural complexity and nonlinearity of +the NNs make it challenging to synthesize a provably safe controller. In this +work, we propose a novel safety filter that relies on convex optimization to +ensure safety for a NN system, subject to additive disturbances that are +capable of capturing modeling errors. Our approach leverages tools from NN +verification to over-approximate NN dynamics with a set of linear bounds, +followed by an application of robust linear MPC to search for controllers that +can guarantee robust constraint satisfaction. We demonstrate the efficacy of +the proposed framework numerically on a nonlinear pendulum system. + +
+
+ comment: This paper has been accepted to the 2023 62nd IEEE Conference on + Decision and Control (CDC) +
+
+
+
+
+ + ♻ ☆ End-to-End Reinforcement Learning of Koopman Models for Economic + Nonlinear Model Predictive Control + + +
+ (Economic) nonlinear model predictive control ((e)NMPC) requires dynamic +system models that are sufficiently accurate in all relevant state-space +regions. These models must also be computationally cheap enough to ensure +real-time tractability. Data-driven surrogate models for mechanistic models can +be used to reduce the computational burden of (e)NMPC; however, such models are +typically trained by system identification for maximum average prediction +accuracy on simulation samples and perform suboptimally as part of actual +(e)NMPC. We present a method for end-to-end reinforcement learning of dynamic +surrogate models for optimal performance in (e)NMPC applications, resulting in +predictive controllers that strike a favorable balance between control +performance and computational demand. We validate our method on two +applications derived from an established nonlinear continuous stirred-tank +reactor model. We compare the controller performance to that of MPCs utilizing +models trained by the prevailing maximum prediction accuracy paradigm, and +model-free neural network controllers trained using reinforcement learning. We +show that our method matches the performance of the model-free neural network +controllers while consistently outperforming models derived from system +identification. Additionally, we show that the MPC policies can react to +changes in the control setting without retraining. + +
+
+ comment: manuscript (18 pages, 7 figures, 5 tables), supplementary materials + (3 pages, 2 tables) +
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research
+since its inception. The technical evolution and emerging capabilities of the
+latest cohort of large language models (LLMs) have reinvigorated the subject
+beyond academia to the cultural zeitgeist. While recent NLP evaluation
+benchmark tasks test some aspects of human-imitative behaviour (e.g.,
+BIG-bench's 'human-like behavior' tasks), few, if any, examine creative
+problem solving abilities. Creative problem solving in humans is a well-studied
+topic in cognitive neuroscience with standardized tests that predominantly use
+the ability to associate (heterogeneous) connections among clue words as a
+metric for creativity. Exposure to misleading stimuli - distractors dubbed red
+herrings - impedes human performance in such tasks via the fixation effect and
+Einstellung paradigm. In cognitive neuroscience studies, such fixations are
+experimentally induced by pre-exposing participants to orthographically similar
+incorrect words to subsequent word-fragments or clues. The popular British quiz
+show Only Connect's Connecting Wall segment essentially mimics Mednick's Remote
+Associates Test (RAT) formulation with built-in, deliberate red herrings, which
+makes it an ideal proxy dataset to explore and study fixation effect and
+Einstellung paradigm from cognitive neuroscience in LLMs. In this paper we
+present the novel Only Connect Wall (OCW) dataset and report results from our
+evaluation of selected pre-trained language models and LLMs on creative problem
+solving tasks like grouping clue words by heterogeneous connections, and
+identifying correct open knowledge domain connections in respective groups. We
+synthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to
+further analyze our red-herrings hypothesis in language models. The code and
+link to the dataset are available at https://github.com/TaatiTeam/OCW.
+
+
+
+ comment: V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3 + (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Map-based Experience Replay: A Memory-Efficient Solution to Catastrophic + Forgetting in Reinforcement Learning + + +
+ Deep Reinforcement Learning agents often suffer from catastrophic forgetting, +forgetting previously found solutions in parts of the input space when training +on new data. Replay Memories are a common solution to the problem, +decorrelating and shuffling old and new training samples. They naively store +state transitions as they come in, without regard for redundancy. We introduce +a novel cognitive-inspired replay memory approach based on the +Grow-When-Required (GWR) self-organizing network, which resembles a map-based +mental model of the world. Our approach organizes stored transitions into a +concise environment-model-like network of state-nodes and transition-edges, +merging similar samples to reduce the memory size and increase pair-wise +distance among samples, which increases the relevancy of each sample. Overall, +our paper shows that map-based experience replay allows for significant memory +reduction with only small performance decreases. + +
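A toy sketch of the merge-or-grow behaviour that underlies such map-based memories: a new transition state is averaged into the nearest stored node when it is close enough, and otherwise becomes a new node. This mimics the flavour of Grow-When-Required networks without implementing edges, habituation, or the full GWR update rules.

```python
import numpy as np

class MapReplayMemory:
    """Toy map-like replay buffer: merge similar samples into nodes to
    shrink the memory while keeping stored samples well separated."""

    def __init__(self, merge_dist=0.5):
        self.nodes, self.counts, self.merge_dist = [], [], merge_dist

    def add(self, state):
        state = np.asarray(state, dtype=float)
        if self.nodes:
            dists = [np.linalg.norm(state - n) for n in self.nodes]
            j = int(np.argmin(dists))
            if dists[j] < self.merge_dist:   # merge: running mean of samples
                self.counts[j] += 1
                self.nodes[j] += (state - self.nodes[j]) / self.counts[j]
                return
        self.nodes.append(state)             # grow: distant sample, new node
        self.counts.append(1)

mem = MapReplayMemory()
for s in np.random.default_rng(0).normal(size=(1000, 4)):
    mem.add(s)
print(len(mem.nodes), "nodes stored for 1000 transitions")
```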
+
+
+
+
+ + ♻ ☆ QuadConv: Quadrature-Based Convolutions with Applications to Non-Uniform + PDE Data Compression + + +
+ We present a new convolution layer for deep learning architectures which we +call QuadConv -- an approximation to continuous convolution via quadrature. Our +operator is developed explicitly for use on non-uniform, mesh-based data, and +accomplishes this by learning a continuous kernel that can be sampled at +arbitrary locations. Moreover, the construction of our operator admits an +efficient implementation which we detail and construct. As an experimental +validation of our operator, we consider the task of compressing partial +differential equation (PDE) simulation data from fixed meshes. We show that +QuadConv can match the performance of standard discrete convolutions on uniform +grid data by comparing a QuadConv autoencoder (QCAE) to a standard +convolutional autoencoder (CAE). Further, we show that the QCAE can maintain +this accuracy even on non-uniform data. In both cases, QuadConv also +outperforms alternative unstructured convolution methods such as graph +convolution. + +
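The core of a quadrature-based convolution is the approximation (K * f)(x) ≈ Σ_i w_i K(x − y_i) f(y_i) over mesh points y_i with quadrature weights w_i. A 1-D NumPy sketch with a fixed Gaussian kernel; in QuadConv itself the continuous kernel is learned, and the crude per-point weights here are an assumption.

```python
import numpy as np

def quad_conv(points, values, weights, kernel, out_points):
    """Quadrature approximation of a continuous convolution on a mesh:
    (K * f)(x) ~= sum_i w_i * K(x - y_i) * f(y_i)."""
    out = np.empty(len(out_points))
    for k, x in enumerate(out_points):
        out[k] = np.sum(weights * kernel(x - points) * values)
    return out

# Non-uniform 1-D mesh with local-spacing quadrature weights.
y = np.sort(np.random.default_rng(0).uniform(0, 1, 64))
w = np.gradient(y)                      # crude per-point quadrature weights
f = np.sin(2 * np.pi * y)               # samples of the signal on the mesh
gauss = lambda r: np.exp(-(r / 0.1) ** 2)
print(quad_conv(y, f, w, gauss, out_points=np.linspace(0, 1, 5)))
```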
+
+ comment: 26 pages, 18 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ SimFBO: Towards Simple, Flexible and Communication-efficient Federated + Bilevel Learning + + +
+ Federated bilevel optimization (FBO) has shown great potential recently in +machine learning and edge computing due to the emerging nested optimization +structure in meta-learning, fine-tuning, hyperparameter tuning, etc. However, +existing FBO algorithms often involve complicated computations and require +multiple sub-loops per iteration, each of which contains a number of +communication rounds. In this paper, we propose a simple and flexible FBO +framework named SimFBO, which is easy to implement without sub-loops, and +includes a generalized server-side aggregation and update for improving +communication efficiency. We further propose System-level heterogeneity robust +FBO (ShroFBO) as a variant of SimFBO with stronger resilience to heterogeneous +local computation. We show that SimFBO and ShroFBO provably achieve a linear +convergence speedup with partial client participation and client sampling +without replacement, as well as improved sample and communication complexities. +Experiments demonstrate the effectiveness of the proposed methods over existing +FBO algorithms. + +
+
+
+
+
+ + ♻ ☆ An active learning method for solving competitive multi-agent + decision-making and control problems + + +
+ We propose a scheme based on active learning to reconstruct private +strategies executed by a population of interacting agents and predict an exact +outcome of the underlying multi-agent interaction process, here identified as a +stationary action profile. We envision a scenario where an external observer, +endowed with a learning procedure, can make queries and observe the agents' +reactions through private action-reaction mappings, whose collective fixed +point corresponds to a stationary profile. By iteratively collecting sensible +data and updating parametric estimates of the action-reaction mappings, we +establish sufficient conditions to assess the asymptotic properties of the +proposed active learning methodology so that, if convergence happens, it can +only be towards a stationary action profile. This fact yields two main +consequences: i) learning locally-exact surrogates of the action-reaction +mappings allows the external observer to succeed in its prediction task, and +ii) working with assumptions so general that a stationary profile is not even +guaranteed to exist, the established sufficient conditions hence act also as +certificates for the existence of such a desirable profile. Extensive numerical +simulations involving typical competitive multi-agent control and +decision-making problems illustrate the practical effectiveness of the proposed +learning-based approach. + +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning for Wind and Energy Storage Coordination in + Wholesale Energy and Ancillary Service Markets + + +
+ Wind energy has been increasingly adopted to mitigate climate change.
+However, the variability of wind energy causes wind curtailment, resulting in
+considerable economic losses for wind farm owners. Wind curtailment can be
+reduced using battery energy storage systems (BESS) as onsite backup sources.
+Yet, this auxiliary role may significantly weaken the economic potential of
+BESS in energy trading. Ideal BESS scheduling should balance onsite wind
+curtailment reduction and market bidding, but practical implementation is
+challenging due to coordination complexity and the stochastic nature of energy
+prices and wind generation. We investigate the joint-market bidding strategy of
+a co-located wind-battery system in the spot and Regulation Frequency Control
+Ancillary Service markets. We propose a novel deep reinforcement learning-based
+approach that decouples the system's market participation into two related
+Markov decision processes for each facility, enabling the BESS to absorb onsite
+wind curtailment while performing joint-market bidding to maximize overall
+operational revenues. Using realistic wind farm data, we validated the
+coordinated bidding strategy, with outcomes surpassing the optimization-based
+benchmark: approximately 25\% higher revenue and 2.3 times greater wind
+curtailment reduction. Our results show that joint-market bidding
+can significantly improve the financial performance of wind-battery systems
+compared to participating in each market separately. Simulations also show that
+using curtailed wind generation as a power source for charging the BESS can
+lead to additional financial gains. The successful implementation of our
+algorithm would encourage co-location of generation and storage assets to
+unlock wider system benefits.
+
+
+
+
+
+
+ + ♻ ☆ Enhancing Agent Communication and Learning through Action and Language + + +
+ We introduce a novel category of GC-agents capable of functioning as both +teachers and learners. Leveraging action-based demonstrations and +language-based instructions, these agents enhance communication efficiency. We +investigate the incorporation of pedagogy and pragmatism, essential elements in +human communication and goal achievement, enhancing the agents' teaching and +learning capabilities. Furthermore, we explore the impact of combining +communication modes (action and language) on learning outcomes, highlighting +the benefits of a multi-modal approach. + +
+
+ comment: IMOL workshop, Paris 2023 +
+
+
+
+
+ + ♻ ☆ Secure & Private Federated Neuroimaging + + +
+ The amount of biomedical data continues to grow rapidly. However, collecting +data from multiple sites for joint analysis remains challenging due to +security, privacy, and regulatory concerns. To overcome this challenge, we use +Federated Learning, which enables distributed training of neural network models +over multiple data sources without sharing data. Each site trains the neural +network over its private data for some time, then shares the neural network +parameters (i.e., weights, gradients) with a Federation Controller, which in +turn aggregates the local models, sends the resulting community model back to +each site, and the process repeats. Our Federated Learning architecture, +MetisFL, provides strong security and privacy. First, sample data never leaves +a site. Second, neural network parameters are encrypted before transmission and +the global neural model is computed under fully-homomorphic encryption. +Finally, we use information-theoretic methods to limit information leakage from +the neural model to prevent a curious site from performing model inversion or +membership attacks. We present a thorough evaluation of the performance of +secure, private federated learning in neuroimaging tasks, including for +predicting Alzheimer's disease and estimating BrainAGE from magnetic resonance +imaging (MRI) studies, in challenging, heterogeneous federated environments +where sites have different amounts of data and statistical distributions. + +
+
+ comment: 18 pages, 13 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ inTformer: A Time-Embedded Attention-Based Transformer for Crash + Likelihood Prediction at Intersections Using Connected Vehicle Data + + +
+ The real-time crash likelihood prediction model is an essential component of
+the proactive traffic safety management system. Over the years, numerous
+studies have attempted to construct a crash likelihood prediction model in
+order to enhance traffic safety, but mostly on freeways. In the majority of the
+existing studies, researchers have primarily employed a deep learning-based
+framework to identify crash potential. Lately, Transformer has emerged as a
+potential deep neural network that fundamentally operates through
+attention-based mechanisms. Transformer has several functional benefits over
+extant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can
+readily handle long-term dependencies in a data sequence. Secondly,
+Transformers can process all elements in a data sequence in parallel during
+training. Finally, a Transformer does not have the vanishing gradient issue.
+Realizing the immense possibility of Transformers, this paper proposes
+inTersection-Transformer (inTformer), a time-embedded attention-based
+Transformer model that can effectively predict intersection crash likelihood in
+real-time. The proposed model was evaluated using connected vehicle data
+extracted from Signal Analytics Platform. Acknowledging the complex traffic
+operation mechanism at intersections, this study developed zone-specific models
+by dividing the intersection region into two distinct zones: the
+within-intersection zone and the approach zone. The best inTformer models in
+the 'within-intersection' and 'approach' zones achieved sensitivities of 73%
+and 70%, respectively. The zone-level models were also compared to earlier
+studies on crash likelihood prediction at intersections and with several
+established deep learning models trained on the same connected vehicle dataset.
+
+
+
+ comment: 29 pages, 10 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ NNP/MM: Accelerating molecular dynamics simulations with machine + learning potentials and molecular mechanics + + +
+ Machine learning potentials have emerged as a means to enhance the accuracy +of biomolecular simulations. However, their application is constrained by the +significant computational cost arising from the vast number of parameters +compared to traditional molecular mechanics. To tackle this issue, we introduce +an optimized implementation of the hybrid method (NNP/MM), which combines +neural network potentials (NNP) and molecular mechanics (MM). This approach +models a portion of the system, such as a small molecule, using NNP while +employing MM for the remaining system to boost efficiency. By conducting +molecular dynamics (MD) simulations on various protein-ligand complexes and +metadynamics (MTD) simulations on a ligand, we showcase the capabilities of our +implementation of NNP/MM. It has enabled us to increase the simulation speed by +5 times and achieve a combined sampling of one microsecond for each complex, +marking the longest simulations ever reported for this class of simulation. + +
+
+
+
+
+ + ♻ ☆ No Fear of Classifier Biases: Neural Collapse Inspired Federated + Learning with Synthetic and Fixed Classifier ICCV 2023 + + +
+ Data heterogeneity is an inherent challenge that hinders the performance of
+federated learning (FL). Recent studies have identified the biased classifiers
+of local models as the key bottleneck. Previous attempts have used classifier
+calibration after FL training, but this approach falls short in improving the
+poor feature representations caused by training-time classifier biases.
+Resolving the classifier bias dilemma in FL requires a full understanding of
+the mechanisms behind the classifier. Recent advances in neural collapse have
+shown that the classifiers and feature prototypes under perfect training
+scenarios collapse into an optimal structure called the simplex equiangular
+tight frame (ETF). Building on this neural collapse insight, we propose a
+solution to FL's classifier bias problem by utilizing a synthetic and fixed ETF
+classifier during training. The optimal classifier structure enables all
+clients to learn unified and optimal feature representations even under
+extremely heterogeneous data. We devise several effective modules to better
+adapt the ETF structure in FL, achieving both high generalization and
+personalization. Extensive experiments demonstrate that our method achieves
+state-of-the-art performances on CIFAR-10, CIFAR-100, and Tiny-ImageNet.
+
+
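The simplex equiangular tight frame used as the fixed classifier can be built in closed form: M = sqrt(C/(C-1)) U (I_C - (1/C) 1 1^T), where U has orthonormal columns. A small sketch verifying the defining property (unit-norm class prototypes with pairwise inner product -1/(C-1)); the dimensions are illustrative.

```python
import numpy as np

def simplex_etf(num_classes, dim):
    """Simplex ETF: C unit vectors in R^dim with pairwise inner products
    -1/(C-1), the classifier geometry that neural collapse predicts."""
    C = num_classes
    # Column-orthonormal U via QR of a random dim x C matrix (dim >= C).
    U, _ = np.linalg.qr(np.random.default_rng(0).normal(size=(dim, C)))
    M = U @ (np.eye(C) - np.ones((C, C)) / C)  # center the frame
    M *= np.sqrt(C / (C - 1))                  # rescale to unit norm
    return M                                   # columns = class prototypes

M = simplex_etf(num_classes=10, dim=64)
print(np.round(M.T @ M, 2)[:3, :3])  # 1.0 on diagonal, -0.11 (=-1/9) off it
```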
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Invariant Lipschitz Bandits: A Side Observation Approach + + +
+ Symmetry arises in many optimization and decision-making problems, and has
+attracted considerable attention from the optimization community: By utilizing
+the existence of such symmetries, the process of searching for optimal
+solutions can be improved significantly. Despite its success in (offline)
+optimization, the utilization of symmetries has not been well examined within
+the online optimization settings, especially in the bandit literature. As such,
+in this paper we study the invariant Lipschitz bandit setting, a subclass of
+the Lipschitz bandits where the reward function and the set of arms are
+preserved under a group of transformations. We introduce an algorithm named
+\texttt{UniformMesh-N}, which naturally integrates side observations using
+group orbits into the \texttt{UniformMesh} algorithm
+(\cite{Kleinberg2005_UniformMesh}), which uniformly discretizes the set of
+arms. Using the side-observation approach, we prove an improved regret upper
+bound, which depends on the cardinality of the group, given that the group is
+finite. We also prove a matching lower bound on regret for the invariant
+Lipschitz bandit class (up to logarithmic factors). We hope that our work will
+ignite further investigation of symmetry in bandit theory and sequential
+decision-making theory in general.
+
+
+
+
+
+
+ + ♻ ☆ A noise-robust acoustic method for recognizing foraging activities of + grazing cattle + + +
+ Farmers must continuously improve their livestock production systems to
+remain competitive in the growing dairy market. Precision livestock farming
+technologies provide individualized monitoring of animals on commercial farms,
+optimizing livestock production. Continuous acoustic monitoring is a widely
+accepted sensing technique used to estimate the daily rumination and grazing
+time budget of free-ranging cattle. However, typical environmental and natural
+noises on pastures noticeably affect performance, limiting the practical
+application of current acoustic methods. In this study, we present the
+operating principle and generalization capability of an acoustic method called
+Noise-Robust Foraging Activity Recognizer (NRFAR). The proposed method
+determines foraging activity bouts by analyzing fixed-length segments of
+identified jaw movement events produced during grazing and rumination. The
+additive noise robustness of the NRFAR was evaluated for several
+signal-to-noise ratios using stationary Gaussian white noise and four different
+nonstationary natural noise sources. In noiseless conditions, NRFAR reached an
+average balanced accuracy of 86.4%, outperforming two previous acoustic methods
+by more than 7.5%. Furthermore, NRFAR performed better than previous acoustic
+methods in 77 of 80 evaluated noisy scenarios (53 cases with p<0.05). NRFAR has
+been shown to be effective in harsh free-ranging environments and could be used
+as a reliable solution to improve pasture management and monitor the health and
+welfare of dairy cows. The instrumentation and computational algorithms
+presented in this publication are protected by a pending patent application: AR
+P20220100910. Web demo available at: https://sinc.unl.edu.ar/web-demo/nrfar
+
+
+
+ comment: list of used audio-clips is available in the list_audio_clips.xlsx +
+
+
+
+
+ + ♻ ☆ Differentiable Constrained Imitation Learning for Robot Motion Planning + and Control IROS 2023 + + +
+ Motion planning and control are crucial components of robotics applications
+like automated driving. Here, spatio-temporal hard constraints like system
+dynamics and safety boundaries (e.g., obstacles) restrict the robot's motions.
+Direct methods from optimal control solve a constrained optimization problem.
+However, in many applications finding a proper cost function is inherently
+difficult because of the weighting of partially conflicting objectives. On the
+other hand, Imitation Learning (IL) methods such as Behavior Cloning (BC)
+provide an intuitive framework for learning decision-making from offline
+demonstrations and constitute a promising avenue for planning and control in
+complex robot applications. Prior work primarily relied on soft constraint
+approaches, which use additional auxiliary loss terms describing the
+constraints. However, catastrophic safety-critical failures might occur in
+out-of-distribution (OOD) scenarios. This work integrates the flexibility of IL
+with hard constraint handling in optimal control. Our approach constitutes a
+general framework for constrained robotic motion planning and control, as well
+as traffic agent simulation, whereas we focus on mobile robot and automated
+driving applications. Hard constraints are integrated into the learning problem
+in a differentiable manner, via explicit completion and gradient-based
+correction. Simulated experiments of mobile robot navigation and automated
+driving provide evidence for the performance of the proposed method.
+
+
+
+ comment: International Conference on Intelligent Robots and Systems Agents4AD + Workshop, IROS 2023 +
+
+
+
+
+ + ♻ ☆ Sufficient Invariant Learning for Distribution Shift + + +
+ Machine learning algorithms have shown remarkable performance in diverse +applications. However, it is still challenging to guarantee performance in +distribution shifts when distributions of training and test datasets are +different. There have been several approaches to improve the performance in +distribution shift cases by learning invariant features across groups or +domains. However, we observe that the previous works only learn invariant +features partially. While the prior works focus on the limited invariant +features, we first raise the importance of the sufficient invariant features. +Since only training sets are given empirically, the learned partial invariant +features from training sets might not be present in the test sets under +distribution shift. Therefore, the performance improvement on distribution +shifts might be limited. In this paper, we argue that learning sufficient +invariant features from the training set is crucial for the distribution shift +case. Concretely, we newly observe the connection between a) sufficient +invariant features and b) flatness differences between groups or domains. +Moreover, we propose a new algorithm, Adaptive Sharpness-aware Group +Distributionally Robust Optimization (ASGDRO), to learn sufficient invariant +features across domains or groups. ASGDRO learns sufficient invariant features +by seeking common flat minima across all groups or domains. Therefore, ASGDRO +improves the performance on diverse distribution shift cases. Besides, we +provide a new simple dataset, Heterogeneous-CMNIST, to diagnose whether the +various algorithms learn sufficient invariant features. + +
+
+
+
+
+ + ♻ ☆ MKL-$L_{0/1}$-SVM + + +
+ This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework
+for the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some
+KKT-like first-order optimality conditions are provided and then exploited to
+develop a fast ADMM algorithm to solve the nonsmooth nonconvex optimization
+problem. Numerical experiments on synthetic and real datasets show that the
+performance of our MKL-$L_{0/1}$-SVM is comparable with that of the leading
+approach SimpleMKL, developed by Rakotomamonjy, Bach, Canu, and
+Grandvalet [Journal of Machine Learning Research, vol.~9, pp.~2491--2521,
+2008].
+
+
+
+ comment: 26 pages in the JMLR template, 4 figures, and 2 tables. arXiv admin + note: substantial text overlap with arXiv:2303.04445 +
+
+
+
+
+ + ♻ ☆ Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in + First Trimester 3D Ultrasound + + +
+ Segmentation and spatial alignment of ultrasound (US) imaging data acquired
+in the first trimester are crucial for monitoring human embryonic growth and
+development throughout this critical period of life. Current approaches are
+either manual or semi-automatic and are therefore very time-consuming and prone
+to errors. To automate these tasks, we propose a multi-atlas framework for
+automatic segmentation and spatial alignment of the embryo using deep learning
+with minimal supervision. Our framework learns to register the embryo to an
+atlas, which consists of the US images acquired at a range of gestational age
+(GA), segmented and spatially aligned to a predefined standard orientation.
+From this, we can derive the segmentation of the embryo and put the embryo in
+standard orientation. US images acquired from 8+0 until 12+6 weeks GA were used
+and eight subjects were selected as atlas. We evaluated different fusion
+strategies to incorporate multiple atlases: 1) training the framework using
+atlas images from a single subject, 2) training the framework with data of all
+available atlases and 3) ensembling of the frameworks trained per subject. To
+evaluate the performance, we calculated the Dice score over the test set. We
+found that training the framework using all available atlases outperformed
+ensembling and gave similar results compared to the best of all frameworks
+trained on a single subject. Furthermore, we found that selecting images from
+the four atlases closest in GA out of all available atlases, regardless of the
+individual quality, gave the best results with a median Dice score of 0.72. We
+conclude that our framework can accurately segment and spatially align the
+embryo in first trimester 3D US images and is robust for the variation in
+quality that existed in the available atlases.
+
+
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html +
+
+
+
+
+ + ♻ ☆ Heterogeneous Decentralized Machine Unlearning with Seed Model + Distillation + + +
+ As some recent information security legislation endowed users with +unconditional rights to be forgotten by any trained machine learning model, +personalized IoT service providers have to put unlearning functionality into +their consideration. The most straightforward method to unlearn users' +contribution is to retrain the model from the initial state, which is not +realistic in high throughput applications with frequent unlearning requests. +Though some machine unlearning frameworks have been proposed to speed up the +retraining process, they fail to match decentralized learning scenarios. In +this paper, we design a decentralized unlearning framework called HDUS, which +uses distilled seed models to construct erasable ensembles for all clients. +Moreover, the framework is compatible with heterogeneous on-device models, +representing stronger scalability in real-world applications. Extensive +experiments on three real-world datasets show that our HDUS achieves +state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ The Re-Label Method For Data-Centric Machine Learning + + +
+ In industrial deep learning applications, manually labeled data often
+contains a certain amount of noise. To solve this problem and achieve a score
+above 90 on the dev dataset, we present a simple method to find the noisy data
+and have humans re-label it, using the model predictions as references during
+labeling. In this paper, we illustrate our idea for a broad set of deep
+learning tasks, including classification, sequence tagging, object detection,
+sequence generation, and click-through rate prediction. The experimental
+results and human evaluation results verify our idea.
+
+
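A minimal sketch of one way to flag re-labeling candidates, consistent with the idea above of using model predictions as references: mark samples where the model confidently disagrees with the stored label. The confidence threshold is an illustrative assumption.

```python
import numpy as np

def flag_noisy(labels, probs, confidence=0.9):
    """Flag samples for human re-labeling when the model disagrees with the
    stored label and is confident in its own prediction."""
    preds = probs.argmax(axis=1)
    conf = probs.max(axis=1)
    return np.where((preds != labels) & (conf >= confidence))[0]

probs = np.array([[0.05, 0.95],   # confident disagreement -> flagged
                  [0.60, 0.40],   # unsure -> kept
                  [0.98, 0.02]])  # confident agreement -> kept
print(flag_noisy(labels=np.array([0, 0, 0]), probs=probs))  # -> [0]
```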
+
+
+
+
+ + ♻ ☆ A probabilistic Taylor expansion with Gaussian processes + + +
+ We study a class of Gaussian processes for which the posterior mean, for a +particular choice of data, replicates a truncated Taylor expansion of any +order. The data consist of derivative evaluations at the expansion point and +the prior covariance kernel belongs to the class of Taylor kernels, which can +be written in a certain power series form. We discuss and prove some results on +maximum likelihood estimation of parameters of Taylor kernels. The proposed +framework is a special case of Gaussian process regression based on data that +is orthogonal in the reproducing kernel Hilbert space of the covariance kernel. + +
+
+ comment: To appear in Transactions on Machine Learning Research +
+
+
+
+
+ + ♻ ☆ Benign Autoencoders + + +
+ Recent progress in Generative Artificial Intelligence (AI) relies on +efficient data representations, often featuring encoder-decoder architectures. +We formalize the mathematical problem of finding the optimal encoder-decoder +pair and characterize its solution, which we name the "benign autoencoder" +(BAE). We prove that BAE projects data onto a manifold whose dimension is the +optimal compressibility dimension of the generative problem. We highlight +surprising connections between BAE and several recent developments in AI, such +as conditional GANs, context encoders, stable diffusion, stacked autoencoders, +and the learning capabilities of generative models. As an illustration, we show +how BAE can find optimal, low-dimensional latent representations that improve +the performance of a discriminator under a distribution shift. By compressing +"malignant" data dimensions, BAE leads to smoother and more stable gradients. + +
+
+ comment: This paper replaces and subsumes arXiv:2110.08884 +
+
+
+
+
+ + ♻ ☆ MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large + Language Models + + +
+ LLMs usually exhibit limitations in incorporating new knowledge, are prone
+to generating hallucinations, and offer little transparency into their
+decision-making process. In this paper, we explore how to prompt LLMs with
+knowledge graphs (KG), working as a remedy to engage LLMs with up-to-date
+knowledge and elicit the reasoning pathways from LLMs. Specifically, we build a
+prompting pipeline that endows LLMs with the capability of comprehending KG
+inputs and inferring with a combination of implicit knowledge and the retrieved
+external knowledge. In addition, we investigate eliciting the mind map on which
+LLMs perform the reasoning and generate the answers. It is identified that the
+produced mind map exhibits the reasoning pathways of LLMs grounded on the
+ontology of knowledge, hence bringing the prospects of probing and gauging LLM
+inference in production. The experiments on three question-answering datasets
+also show that MindMap prompting leads to a striking empirical gain. For
+instance, prompting GPT-3.5 with MindMap consistently yields performance that
+surpasses GPT-4. We also demonstrate that with structured facts
+retrieved from KG, MindMap can outperform a series of
+prompting-with-document-retrieval methods, benefiting from more accurate,
+concise, and comprehensive knowledge from KGs.
+
+
+
+ comment: 7 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ On the Robustness of Random Forest Against Untargeted Data Poisoning: An + Ensemble-Based Approach + + +
+ Machine learning is becoming ubiquitous. From finance to medicine, machine +learning models are boosting decision-making processes and even outperforming +humans in some tasks. This huge progress in terms of prediction quality does +not however find a counterpart in the security of such models and corresponding +predictions, where perturbations of fractions of the training set (poisoning) +can seriously undermine the model accuracy. Research on poisoning attacks and +defenses received increasing attention in the last decade, leading to several +promising solutions aiming to increase the robustness of machine learning. +Among them, ensemble-based defenses, where different models are trained on +portions of the training set and their predictions are then aggregated, provide +strong theoretical guarantees at the price of a linear overhead. Surprisingly, +ensemble-based defenses, which do not pose any restrictions on the base model, +have not been applied to increase the robustness of random forest models. The +work in this paper aims to fill in this gap by designing and implementing a +novel hash-based ensemble approach that protects random forest against +untargeted, random poisoning attacks. An extensive experimental evaluation +measures the performance of our approach against a variety of attacks, as well +as its sustainability in terms of resource consumption and performance, and +compares it with a traditional monolithic model based on random forest. A final +discussion presents our main findings and compares our approach with existing +poisoning defenses targeting random forests. + +
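To illustrate the hash-based ensemble idea, here is a sketch that deterministically hashes each training index into one of k disjoint partitions, trains one random forest per partition, and aggregates by majority vote, so a poisoned sample can influence at most one base model. Hashing on the sample index and the value k = 5 are assumptions for illustration, not necessarily the paper's exact scheme.

```python
import hashlib
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=3000, random_state=0)

# Deterministically hash each sample index into one of k disjoint partitions.
k = 5
part = np.array([int(hashlib.sha256(str(i).encode()).hexdigest(), 16) % k
                 for i in range(len(X))])

# One random forest per partition; a poisoned sample only affects its own.
forests = [RandomForestClassifier(random_state=0).fit(X[part == j], y[part == j])
           for j in range(k)]

# Majority vote over the base models (binary labels here).
votes = np.stack([f.predict(X[:10]) for f in forests])
majority = (votes.mean(axis=0) > 0.5).astype(int)
print(majority)
```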
+ comment: Accepted in IEEE Transactions on Sustainable Computing; 15 pages, 8 figures
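A minimal sketch of the hash-based ensemble idea: samples are routed by a hash of their index into disjoint partitions, one forest is trained per partition, and predictions are aggregated by majority vote. The partition count and hashing scheme are assumptions, not the paper's exact design.

```python
# Sketch of a hash-based ensemble defense: each training sample is routed by a
# hash of its index to one of k disjoint partitions, one random forest is
# trained per partition, and predictions are aggregated by majority vote, so a
# poisoned sample can corrupt at most one partition. Integer class labels assumed.
import hashlib
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def hash_partition(i: int, k: int) -> int:
    return int(hashlib.sha256(str(i).encode()).hexdigest(), 16) % k

def fit_hash_ensemble(X, y, k=5):
    parts = np.array([hash_partition(i, k) for i in range(len(X))])
    return [RandomForestClassifier(n_estimators=50).fit(X[parts == j], y[parts == j])
            for j in range(k)]

def predict_majority(models, X):
    votes = np.stack([m.predict(X) for m in models])  # shape (k, n_samples)
    return np.apply_along_axis(lambda v: np.bincount(v.astype(int)).argmax(), 0, votes)
```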
+ + ♻ ☆ Interpolation for Robust Learning: Data Augmentation on Wasserstein + Geodesics + + +
+ We propose to study and promote the robustness of a model as measured by its performance under interpolations of training data distributions. Specifically, (1) we augment the data by finding the worst-case Wasserstein barycenter on the geodesic connecting subpopulation distributions of different categories; (2) we regularize the model for smoother performance on the continuous geodesic path connecting subpopulation distributions; and (3) we provide a theoretical guarantee of robustness improvement and investigate how the geodesic location and the sample size contribute, respectively. Experimental validations of the proposed strategy on four datasets, including CIFAR-100 and ImageNet, establish the efficacy of our method: for example, it improves the baselines' certifiable robustness on CIFAR-10 by up to 7.7%, and empirical robustness on CIFAR-100 by 16.8%. Our work provides a new perspective on model robustness through the lens of Wasserstein geodesic-based interpolation, with a practical off-the-shelf strategy that can be combined with existing robust training methods.
+ comment: 34 pages, 3 figures, 18 tables
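In one dimension, points on the Wasserstein geodesic between two empirical distributions have a closed form via sorted samples, which makes the interpolation idea easy to see. The sketch below is a simplified stand-in for the paper's barycenter-based augmentation on full subpopulations.

```python
# 1-D illustration: points on the Wasserstein geodesic between two empirical
# distributions are obtained by linearly interpolating sorted samples
# (displacement interpolation; the optimal 1-D coupling is monotone). This is
# a simplified stand-in for the paper's barycenter-based augmentation.
import numpy as np

def wasserstein_geodesic_1d(x: np.ndarray, y: np.ndarray, t: float) -> np.ndarray:
    """Samples from the distribution at position t in [0, 1] on the geodesic."""
    xs, ys = np.sort(x), np.sort(y)      # equal sample sizes assumed
    return (1.0 - t) * xs + t * ys       # McCann's displacement interpolation

rng = np.random.default_rng(0)
class_a = rng.normal(0.0, 1.0, size=1000)
class_b = rng.normal(3.0, 0.5, size=1000)
augmented = wasserstein_geodesic_1d(class_a, class_b, t=0.5)  # midpoint barycenter
```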
+ + ♻ ☆ Deep Unfolding-based Weighted Averaging for Federated Learning in + Heterogeneous Environments + + +
+ Federated learning is a collaborative model training method that iterates model updates by multiple clients and aggregation of the updates by a central server. Device and statistical heterogeneity of participating clients cause significant performance degradation, so an appropriate aggregation weight should be assigned to each client in the server's aggregation phase. To adjust the aggregation weights, this paper employs deep unfolding, a parameter-tuning method that leverages both the data-driven learning capability of deep learning and domain knowledge. This enables us to directly incorporate the heterogeneity of the environment of interest into the tuning of the aggregation weights. The proposed approach can be combined with various federated learning algorithms. Numerical experiments indicate that the proposed method achieves higher test accuracy on unseen class-balanced data than conventional heuristic weighting methods. With the aid of pretrained models, the proposed method can handle large-scale learning models and thus perform practical real-world tasks. A convergence-rate analysis of federated learning algorithms with the proposed method is also provided.
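A minimal sketch of the deep-unfolding flavor of the aggregation step: the per-client weights in the server's averaging are themselves trainable parameters tuned end-to-end. The model shapes and the placeholder loss are assumptions.

```python
# Sketch of deep-unfolding-style aggregation: the per-client weights used in
# the server's averaging step are trainable parameters, tuned end-to-end on
# held-out data. Shapes and the validation loss are stand-ins.
import torch

n_clients, dim = 10, 100
logits = torch.zeros(n_clients, requires_grad=True)      # learnable weight logits
opt = torch.optim.Adam([logits], lr=1e-2)

def aggregate(client_params: torch.Tensor) -> torch.Tensor:
    w = torch.softmax(logits, dim=0)                     # weights sum to one
    return (w[:, None] * client_params).sum(dim=0)       # weighted average

client_params = torch.randn(n_clients, dim)              # stand-in client updates
global_params = aggregate(client_params)
loss = global_params.pow(2).mean()                       # placeholder validation loss
loss.backward()                                          # gradients reach the logits
opt.step()
```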
+ + ♻ ☆ Functional optimal transport: map estimation and domain adaptation for + functional data + + +
+ We introduce a formulation of the optimal transport problem for distributions on function spaces, where the stochastic map between functional domains can be partially represented in terms of an (infinite-dimensional) Hilbert-Schmidt operator mapping a Hilbert space of functions to another. For numerous machine learning tasks, data can be naturally viewed as samples drawn from spaces of functions, such as curves and surfaces, in high dimensions. Optimal transport for functional data analysis provides a useful framework of treatment for such domains. Since probability measures in infinite-dimensional spaces generally lack absolute continuity (that is, with respect to non-degenerate Gaussian measures), the Monge map in the standard optimal transport theory for finite-dimensional spaces may not exist. Our approach to the optimal transport problem in infinite dimensions is via a suitable regularization technique: we restrict the class of transport maps to be a Hilbert-Schmidt space of operators. To this end, we develop an efficient algorithm for finding the stochastic transport map between functional domains and provide theoretical guarantees on the existence, uniqueness, and consistency of our estimate for the Hilbert-Schmidt operator. We validate our method on synthetic datasets and examine the functional properties of the transport map. Experiments on real-world datasets of robot arm trajectories further demonstrate the effectiveness of our method on applications in domain adaptation.
+ comment: 48 pages, 10 figures, 3 tables
+ + ♻ ☆ Revolutionizing Genomics with Reinforcement Learning Techniques + + +
+ In recent years, Reinforcement Learning (RL) has emerged as a powerful tool +for solving a wide range of problems, including decision-making and genomics. +The exponential growth of raw genomic data over the past two decades has +exceeded the capacity of manual analysis, leading to a growing interest in +automatic data analysis and processing. RL algorithms are capable of learning +from experience with minimal human supervision, making them well-suited for +genomic data analysis and interpretation. One of the key benefits of using RL +is the reduced cost associated with collecting labeled training data, which is +required for supervised learning. While there have been numerous studies +examining the applications of Machine Learning (ML) in genomics, this survey +focuses exclusively on the use of RL in various genomics research fields, +including gene regulatory networks (GRNs), genome assembly, and sequence +alignment. We present a comprehensive technical overview of existing studies on +the application of RL in genomics, highlighting the strengths and limitations +of these approaches. We then discuss potential research directions that are +worthy of future exploration, including the development of more sophisticated +reward functions as RL heavily depends on the accuracy of the reward function, +the integration of RL with other machine learning techniques, and the +application of RL to new and emerging areas in genomics research. Finally, we +present our findings and conclude by summarizing the current state of the field +and the future outlook for RL in genomics. + +
+ + ♻ ☆ On Formal Feature Attribution and Its Approximation + + +
+ Recent years have witnessed the widespread use of artificial intelligence (AI) algorithms and machine learning (ML) models. Despite their tremendous success, a number of vital problems like ML model brittleness, their fairness, and the lack of interpretability warrant the need for active developments in explainable artificial intelligence (XAI) and formal ML model verification. The two major lines of work in XAI include feature selection methods, e.g. Anchors, and feature attribution techniques, e.g. LIME and SHAP. Despite their promise, most of the existing feature selection and attribution approaches are susceptible to a range of critical issues, including explanation unsoundness and out-of-distribution sampling. A recent formal approach to XAI (FXAI), although serving as an alternative to the above and free of these issues, suffers from a few other limitations. For instance, besides its scalability limitation, the formal approach is unable to tackle the feature attribution problem. Additionally, a formal explanation, despite being formally sound, is typically quite large, which hampers its applicability in practical settings. Motivated by the above, this paper proposes a way to apply the apparatus of formal XAI to the case of feature attribution based on formal explanation enumeration. Formal feature attribution (FFA) is argued to be advantageous over the existing methods, both formal and non-formal. Given the practical complexity of the problem, the paper then proposes an efficient technique for approximating exact FFA. Finally, it offers experimental evidence of the effectiveness of the proposed approximate FFA in comparison to the existing feature attribution algorithms, not only in terms of feature importance but also in terms of their relative order.
+ + ♻ ☆ Continuous-Time User Preference Modelling for Temporal Sets Prediction + + +
+ Given a sequence of sets, where each set has a timestamp and contains an arbitrary number of elements, temporal sets prediction aims to predict the elements in the subsequent set. Previous studies on temporal sets prediction mainly focus on modelling the elements, and implicitly represent each user's preference from his/her interacted elements. However, user preferences are often continuously evolving, and this evolutionary trend cannot be fully captured with such an indirect learning paradigm. To this end, we propose a continuous-time user preference modelling framework for temporal sets prediction, which explicitly models the evolving preference of each user by maintaining a memory bank to store the states of all users and elements. Specifically, we first construct a universal sequence by arranging all the user-set interactions in non-descending temporal order, and then chronologically learn from each user-set interaction. For each interaction, we continuously update the memories of the related user and elements based on their currently encoded messages and past memories. Moreover, we present a personalized user behavior learning module to discover user-specific characteristics based on each user's historical sequence, which aggregates the previously interacted elements from dual perspectives according to the user and elements. Finally, we develop a set-batch algorithm to improve the model efficiency, which creates time-consistent batches in advance and achieves 3.5x and 3.0x average speedups in training and evaluation. Experiments on four real-world datasets demonstrate the superiority of our approach over state-of-the-art methods under both transductive and inductive settings, as well as its good interpretability.
+ comment: Accepted by the TKDE journal
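A minimal sketch of the memory-bank idea: user and element states are updated chronologically with a recurrent cell as each user-set interaction arrives. Dimensions, the set encoding, and the update rule are illustrative assumptions.

```python
# Sketch of the continuous-time memory idea: user and element states live in a
# memory bank and are updated chronologically with a recurrent cell as each
# user-set interaction arrives. Dimensions and the message encoding are assumed.
import torch
import torch.nn as nn

dim, n_users, n_elements = 32, 100, 500
user_mem = torch.zeros(n_users, dim)     # memory bank: one state per user
elem_mem = torch.zeros(n_elements, dim)  # ... and one per element
cell = nn.GRUCell(input_size=dim, hidden_size=dim)

@torch.no_grad()
def update_on_interaction(user: int, elements: list[int]) -> None:
    msg = elem_mem[elements].mean(dim=0, keepdim=True)            # encode the set
    user_mem[user] = cell(msg, user_mem[user].unsqueeze(0))[0]    # evolve user state
    for e in elements:                                            # evolve element states
        elem_mem[e] = cell(user_mem[user].unsqueeze(0), elem_mem[e].unsqueeze(0))[0]

# interactions are processed in non-descending temporal order
for user, elements, _ts in [(0, [3, 7], 1.0), (1, [7, 42], 2.0)]:
    update_on_interaction(user, elements)
```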
+ + ♻ ☆ Symmetry-Preserving Program Representations for Learning Code Semantics + + +
+ Large Language Models (LLMs) have shown promise in automated program +reasoning, a crucial aspect of many security tasks. However, existing LLM +architectures for code are often borrowed from other domains like natural +language processing, raising concerns about their generalization and robustness +to unseen code. A key generalization challenge is to incorporate the knowledge +of code semantics, including control and data flow, into the LLM architectures. + Drawing inspiration from examples of convolution layers exploiting +translation symmetry, we explore how code symmetries can enhance LLM +architectures for program analysis and modeling. We present a rigorous +group-theoretic framework that formally defines code symmetries as +semantics-preserving transformations and provides techniques for precisely +reasoning about symmetry preservation within LLM architectures. Using this +framework, we introduce a novel variant of self-attention that preserves +program symmetries, demonstrating its effectiveness in generalization and +robustness through detailed experimental evaluations across different binary +and source code analysis tasks. Overall, our code symmetry framework offers +rigorous and powerful reasoning techniques that can guide the future +development of specialized LLMs for code and advance LLM-guided program +reasoning tasks. + +
+ + ♻ ☆ One-shot Ultra-high-Resolution Generative Adversarial Network That + Synthesizes 16K Images On A Single GPU + + +
+ We propose a one-shot ultra-high-resolution generative adversarial network (OUR-GAN) framework that generates non-repetitive 16K (16,384 x 8,640) images from a single training image and is trainable on a single consumer GPU. OUR-GAN generates an initial image that is visually plausible and varied in shape at low resolution, and then gradually increases the resolution by adding detail through super-resolution. Since OUR-GAN learns from a real ultra-high-resolution (UHR) image, it can synthesize large shapes with fine details and long-range coherence, which is difficult to achieve with conventional generative models that rely on the patch distribution learned from relatively small images. OUR-GAN can synthesize high-quality 16K images with 12.5 GB of GPU memory and 4K images with only 4.29 GB, as it synthesizes a UHR image part by part through seamless subregion-wise super-resolution. Additionally, OUR-GAN improves visual coherence while maintaining diversity by applying vertical positional convolution. In experiments on the ST4K and RAISE datasets, OUR-GAN exhibited improved fidelity, visual coherence, and diversity compared with the baseline one-shot synthesis models. To the best of our knowledge, OUR-GAN is the first one-shot image synthesizer that generates non-repetitive UHR images on a single consumer GPU. The synthesized image samples are presented at https://our-gan.github.io.
+ comment: 36 pages, 26 figures
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
+ + ♻ ☆ A Low Latency Adaptive Coding Spiking Framework for Deep Reinforcement + Learning + + +
+ In recent years, spiking neural networks (SNNs) have been used in reinforcement learning (RL) due to their low power consumption and event-driven features. However, spiking reinforcement learning (SRL), which suffers from fixed coding methods, still faces the problems of high latency and poor versatility. In this paper, we use learnable matrix multiplication to encode and decode spikes, improving the flexibility of the coders and thus reducing latency. Meanwhile, we train the SNNs using the direct training method and use two different structures for online and offline RL algorithms, which gives our model a wider range of applications. Extensive experiments have revealed that our method achieves optimal performance with ultra-low latency (as low as 0.8% of that of other SRL methods) and excellent energy efficiency (up to 5X that of DNNs) across different algorithms and environments.
+ + ♻ ☆ Near-Optimal Nonconvex-Strongly-Convex Bilevel Optimization with Fully + First-Order Oracles + + +
+ Bilevel optimization has wide applications such as hyperparameter tuning, +neural architecture search, and meta-learning. Designing efficient algorithms +for bilevel optimization is challenging because the lower-level problem defines +a feasibility set implicitly via another optimization problem. In this work, we +consider one tractable case when the lower-level problem is strongly convex. +Recent works show that with a Hessian-vector product oracle, one can provably +find an $\epsilon$-first-order stationary point within +$\tilde{\mathcal{O}}(\epsilon^{-2})$ oracle calls. However, Hessian-vector +product may be inaccessible or expensive in practice. Kwon et al. (ICML 2023) +addressed this issue by proposing a first-order method that can achieve the +same goal at a slower rate of $\tilde{\mathcal{O}}(\epsilon^{-3})$. In this +work, we provide a tighter analysis demonstrating that this method can converge +at the near-optimal $\tilde {\mathcal{O}}(\epsilon^{-2})$ rate as second-order +methods. Our analysis further leads to simple first-order algorithms that +achieve similar convergence rates for finding second-order stationary points +and for distributed bilevel problems. + +
+ comment: slightly change the title
+ + ♻ ☆ Federated Linear Bandit Learning via Over-the-Air Computation + + +
+ In this paper, we investigate federated contextual linear bandit learning +within a wireless system that comprises a server and multiple devices. Each +device interacts with the environment, selects an action based on the received +reward, and sends model updates to the server. The primary objective is to +minimize cumulative regret across all devices within a finite time horizon. To +reduce the communication overhead, devices communicate with the server via +over-the-air computation (AirComp) over noisy fading channels, where the +channel noise may distort the signals. In this context, we propose a customized +federated linear bandits scheme, where each device transmits an analog signal, +and the server receives a superposition of these signals distorted by channel +noise. A rigorous mathematical analysis is conducted to determine the regret +bound of the proposed scheme. Both theoretical analysis and numerical +experiments demonstrate the competitive performance of our proposed scheme in +terms of regret bounds in various settings. + +
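A minimal sketch of over-the-air aggregation: each device pre-scales its analog update against its channel gain, and the server receives the noisy superposition of all transmissions in one shot. The channel model and scaling are illustrative assumptions.

```python
# Sketch of over-the-air computation (AirComp): each device transmits an analog
# signal pre-scaled by the inverse of its fading gain, the channel re-applies
# the gains, and the server receives the superposition plus noise in one shot.
import numpy as np

rng = np.random.default_rng(0)
n_devices, dim = 8, 16
updates = rng.normal(size=(n_devices, dim))          # local model updates
gains = rng.rayleigh(scale=1.0, size=n_devices)      # fading channel gains
tx = updates / gains[:, None]                        # channel-inversion pre-scaling
received = tx * gains[:, None]                       # the channel applies the gains
noise = rng.normal(scale=0.1, size=dim)              # additive receiver noise
aggregate = received.sum(axis=0) + noise             # superposed signal at server
estimate = aggregate / n_devices                     # noisy average of the updates
```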
+ + ♻ ☆ Incomplete Multi-View Weak-Label Learning + + +
+ A variety of modern applications exhibit multi-view multi-label learning, +where each sample has multi-view features, and multiple labels are correlated +via common views. Current methods usually fail to directly deal with the +setting where only a subset of features and labels are observed for each +sample, and ignore the presence of noisy views and imbalanced labels in +real-world problems. In this paper, we propose a novel method to overcome the +limitations. It jointly embeds incomplete views and weak labels into a +low-dimensional subspace with adaptive weights, and facilitates the difference +between embedding weight matrices via auto-weighted Hilbert-Schmidt +Independence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively +learns view-wise importance for embedding to detect noisy views, and mitigates +the label imbalance problem by focal loss. Experimental results on four +real-world multi-view multi-label datasets demonstrate the effectiveness of the +proposed method. + +
+ comment: 6 pages, 2 figures, conference
+ + ♻ ☆ ProAgent: Building Proactive Cooperative AI with Large Language Models + + +
+ Building AIs with adaptive behaviors in human-AI cooperation stands as a pivotal focus in AGI research. Current methods for developing cooperative agents predominantly rely on learning-based methods, where policy generalization heavily hinges on past interactions with specific teammates. These approaches constrain the agent's capacity to recalibrate its strategy when confronted with novel teammates. We propose ProAgent, a novel framework that harnesses large language models (LLMs) to fashion a proactive agent empowered with the ability to anticipate teammates' forthcoming decisions and formulate enhanced plans for itself. ProAgent excels at cooperative reasoning with the capacity to dynamically adapt its behavior to enhance collaborative efforts with teammates. Moreover, the ProAgent framework exhibits a high degree of modularity and interpretability, facilitating seamless integration to address a wide array of coordination scenarios. Experimental evaluations conducted within the Overcooked-AI framework unveil the remarkable performance superiority of ProAgent, outperforming five methods based on self-play and population-based training in cooperation with AI agents. Further, when cooperating with human proxy models, its performance exhibits an average improvement exceeding 10% compared to the current state-of-the-art, COLE. The advancement was consistently observed across diverse scenarios involving interactions with both AI agents of varying characteristics and human counterparts. These findings inspire future research on human-robot collaboration. For a hands-on demonstration, please visit https://pku-proagent.github.io.
+ + ♻ ☆ BarlowRL: Barlow Twins for Data-Efficient Reinforcement Learning + + +
+ This paper introduces BarlowRL, a data-efficient reinforcement learning agent that combines the Barlow Twins self-supervised learning framework with the DER (Data-Efficient Rainbow) algorithm. BarlowRL outperforms both DER and its contrastive counterpart CURL on the Atari 100k benchmark. BarlowRL avoids dimensional collapse by enforcing information spread over the whole embedding space. This helps the RL algorithm utilize a uniformly spread state representation, which eventually results in remarkable performance. The integration of Barlow Twins with DER enhances data efficiency and achieves superior performance on RL tasks. BarlowRL demonstrates the potential of incorporating self-supervised learning techniques to improve RL algorithms.
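For reference, a compact sketch of the Barlow Twins objective that BarlowRL uses as its auxiliary loss: the cross-correlation matrix of two views' embeddings is pushed toward the identity, which spreads information across dimensions and counteracts dimensional collapse. The off-diagonal weight is an assumed hyperparameter.

```python
# Sketch of the Barlow Twins objective: normalize two views' embeddings per
# dimension, form their cross-correlation matrix, and push it toward the
# identity (invariance on the diagonal, de-redundancy off the diagonal).
import torch

def barlow_twins_loss(z1: torch.Tensor, z2: torch.Tensor, lam: float = 5e-3) -> torch.Tensor:
    n, d = z1.shape
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)        # normalize per dimension
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
    c = (z1.T @ z2) / n                                # d x d cross-correlation
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()     # invariance term
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()  # redundancy term
    return on_diag + lam * off_diag

loss = barlow_twins_loss(torch.randn(128, 64), torch.randn(128, 64))
```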
+ + ♻ ☆ QNet: A Quantum-native Sequence Encoder Architecture + + +
+ This work proposes QNet, a novel sequence encoder model that performs inference entirely on a quantum computer using a minimal number of qubits. Let $n$ and $d$ represent the length of the sequence and the embedding size, respectively. The dot-product attention mechanism requires a time complexity of $O(n^2 \cdot d)$, while QNet has merely $O(n+d)$ quantum circuit depth. In addition, we introduce ResQNet, a quantum-classical hybrid model composed of several QNet blocks linked by residual connections, as an isomorph of the Transformer encoder. We evaluate our work on various natural language processing tasks, including text classification, rating score prediction, and named entity recognition. Our models exhibit compelling performance over classical state-of-the-art models with a thousand times fewer parameters. In summary, this work investigates the advantage of machine learning on near-term quantum computers for sequential data by experimenting with natural language processing tasks.
+ comment: QCE23: 2023 IEEE International Conference on Quantum Computing & Engineering
+ + ♻ ☆ Bayesian low-rank adaptation for large language models + + +
+ Parameter-efficient fine-tuning (PEFT) has emerged as a new paradigm for cost-efficient fine-tuning of large language models (LLMs), with low-rank adaptation (LoRA) being a widely adopted choice. However, fine-tuned LLMs often become overconfident, especially when fine-tuned on small datasets. Bayesian methods, with their inherent ability to estimate uncertainty, serve as potent tools to mitigate overconfidence and enhance calibration. In this work, we introduce Laplace-LoRA, a straightforward yet effective Bayesian method that applies the Laplace approximation to the LoRA parameters and considerably boosts the calibration of fine-tuned LLMs.
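A rough sketch of the Laplace idea applied to a low-rank adapter: fit a diagonal Gaussian posterior over the small set of LoRA parameters from squared gradients of the fine-tuning loss, then average predictions over posterior samples. The shapes, prior precision, and stand-in loss are assumptions; the paper's exact curvature approximation may differ.

```python
# Sketch of a diagonal Laplace approximation over (flattened) LoRA parameters:
# posterior precision ~ squared gradients of the loss plus a prior term, and
# the predictive is averaged over posterior samples. All quantities here are
# stand-ins for illustration, not the paper's exact procedure.
import torch

theta = torch.randn(1000, requires_grad=True)        # flattened LoRA parameters
def nll(params):                                     # stand-in fine-tuning loss
    return params.pow(2).sum()

g = torch.autograd.grad(nll(theta), theta)[0]
precision = g.pow(2) + 1.0                           # diag curvature + prior precision
posterior_std = precision.rsqrt()

preds = []
for _ in range(10):                                  # posterior predictive by sampling
    sample = theta.detach() + posterior_std * torch.randn_like(theta)
    preds.append(sample.mean())                      # stand-in for a model forward pass
calibrated = torch.stack(preds).mean()
```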
+ + ♻ ☆ Energy Management of Multi-mode Plug-in Hybrid Electric Vehicle using + Multi-agent Deep Reinforcement Learning + + +
+ The recently emerging multi-mode plug-in hybrid electric vehicle (PHEV) technology is one of the pathways contributing to decarbonization, and its energy management requires multiple-input and multiple-output (MIMO) control. At present, existing methods usually decouple the MIMO control into multiple-input and single-output (MISO) control and can only achieve locally optimal performance. To optimize the multi-mode vehicle globally, this paper studies a MIMO control method for energy management of the multi-mode PHEV based on multi-agent deep reinforcement learning (MADRL). By introducing a relevance ratio, a hand-shaking strategy is proposed to enable two learning agents to work collaboratively under the MADRL framework using the deep deterministic policy gradient (DDPG) algorithm. Unified settings for the DDPG agents are obtained through a sensitivity analysis of the factors influencing learning performance. The optimal working mode for the hand-shaking strategy is attained through a parametric study on the relevance ratio. The advantage of the proposed energy management method is demonstrated on a software-in-the-loop testing platform. The study indicates that the learning rate of the DDPG agents is the factor with the greatest influence on learning performance. Using the unified DDPG settings and a relevance ratio of 0.2, the proposed MADRL system can save up to 4% energy compared to the single-agent learning system and up to 23.54% energy compared to the conventional rule-based system.
+ + ♻ ☆ Differentially Private Sampling from Rashomon Sets, and the Universality + of Langevin Diffusion for Convex Optimization COLT 2023 + + +
+ In this paper we provide an algorithmic framework based on Langevin diffusion +(LD) and its corresponding discretizations that allow us to simultaneously +obtain: i) An algorithm for sampling from the exponential mechanism, whose +privacy analysis does not depend on convexity and which can be stopped at +anytime without compromising privacy, and ii) tight uniform stability +guarantees for the exponential mechanism. As a direct consequence, we obtain +optimal excess empirical and population risk guarantees for (strongly) convex +losses under both pure and approximate differential privacy (DP). The framework +allows us to design a DP uniform sampler from the Rashomon set. Rashomon sets +are widely used in interpretable and robust machine learning, understanding +variable importance, and characterizing fairness. + +
+ comment: Appeared in COLT 2023. For ease of presentation, some results appear in the previous version of this paper on arXiv (v3) that do not appear in this version, nor are subsumed by results in this version. Please see Section 1.4 for more details
+ + ♻ ☆ Principles and Guidelines for Evaluating Social Robot Navigation + Algorithms + + +
+ A major challenge to deploying robots widely is navigation in human-populated +environments, commonly referred to as social robot navigation. While the field +of social navigation has advanced tremendously in recent years, the fair +evaluation of algorithms that tackle social navigation remains hard because it +involves not just robotic agents moving in static environments but also dynamic +human agents and their perceptions of the appropriateness of robot behavior. In +contrast, clear, repeatable, and accessible benchmarks have accelerated +progress in fields like computer vision, natural language processing and +traditional robot navigation by enabling researchers to fairly compare +algorithms, revealing limitations of existing solutions and illuminating +promising new directions. We believe the same approach can benefit social +navigation. In this paper, we pave the road towards common, widely accessible, +and repeatable benchmarking criteria to evaluate social robot navigation. Our +contributions include (a) a definition of a socially navigating robot as one +that respects the principles of safety, comfort, legibility, politeness, social +competency, agent understanding, proactivity, and responsiveness to context, +(b) guidelines for the use of metrics, development of scenarios, benchmarks, +datasets, and simulators to evaluate social navigation, and (c) a design of a +social navigation metrics framework to make it easier to compare results from +different simulators, robots and datasets. + +
+ comment: 42 pages, 11 figures, 6 tables
+ + ♻ ☆ Scissorhands: Exploiting the Persistence of Importance Hypothesis for + LLM KV Cache Compression at Test Time + + +
+ Large language models (LLMs) have sparked a new wave of exciting AI applications. Hosting these models at scale requires significant memory resources. One crucial memory bottleneck for deployment stems from the context window. It is commonly recognized that model weights are memory hungry; however, the size of the key-value embeddings stored during the generation process (the KV cache) can easily surpass the model size. The enormous size of the KV cache puts constraints on the inference batch size, which is crucial for high-throughput inference workloads. Inspired by an interesting observation of the attention scores, we hypothesize the persistence of importance: only pivotal tokens, which had a substantial influence at one step, will significantly influence future generations. Based on our empirical verification and theoretical analysis around this hypothesis, we propose Scissorhands, a system that maintains the memory usage of the KV cache at a fixed budget without finetuning the model. In essence, Scissorhands manages the KV cache by storing the pivotal tokens with a higher probability. We validate that Scissorhands reduces the inference memory usage of the KV cache by up to 5X without compromising model quality. We further demonstrate that Scissorhands can be combined with 4-bit quantization, traditionally used to compress model weights, to achieve up to 20X compression.
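A minimal sketch of budget-constrained KV-cache eviction guided by the persistence-of-importance hypothesis: tokens with the largest accumulated attention mass are kept once the cache exceeds a fixed budget. The deterministic top-k rule is a simplification of the paper's probabilistic retention.

```python
# Sketch of budgeted KV-cache eviction: keep the tokens with the largest
# accumulated attention scores, dropping the rest once the cache exceeds a
# fixed budget. Deterministic top-k is a simplification of the paper's
# probabilistic retention of pivotal tokens.
import torch

def evict_kv(keys, values, attn_history, budget):
    """keys/values: (seq, dim); attn_history: (seq,) accumulated attention mass."""
    seq = keys.shape[0]
    if seq <= budget:
        return keys, values, attn_history
    keep = torch.topk(attn_history, k=budget).indices.sort().values  # pivotal tokens
    return keys[keep], values[keep], attn_history[keep]

keys, values = torch.randn(2048, 64), torch.randn(2048, 64)
attn_history = torch.rand(2048)                        # stand-in attention statistics
keys, values, attn_history = evict_kv(keys, values, attn_history, budget=512)
```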
+ + ♻ ☆ PRANC: Pseudo RAndom Networks for Compacting deep models + + +
+ We demonstrate that a deep model can be reparametrized as a linear +combination of several randomly initialized and frozen deep models in the +weight space. During training, we seek local minima that reside within the +subspace spanned by these random models (i.e., `basis' networks). Our +framework, PRANC, enables significant compaction of a deep model. The model can +be reconstructed using a single scalar `seed,' employed to generate the +pseudo-random `basis' networks, together with the learned linear mixture +coefficients. + In practical applications, PRANC addresses the challenge of efficiently +storing and communicating deep models, a common bottleneck in several +scenarios, including multi-agent learning, continual learners, federated +systems, and edge devices, among others. In this study, we employ PRANC to +condense image classification models and compress images by compacting their +associated implicit neural networks. PRANC outperforms baselines with a large +margin on image classification when compressing a deep model almost $100$ +times. Moreover, we show that PRANC enables memory-efficient inference by +generating layer-wise weights on the fly. The source code of PRANC is here: +\url{https://github.com/UCDvision/PRANC} + +
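A minimal sketch of the PRANC-style reparametrization: deployed weights are a learned linear combination of frozen pseudo-random basis networks, so only a scalar seed and the mixture coefficients need to be stored or transmitted. The seeding scheme and shapes are illustrative assumptions.

```python
# Sketch of PRANC-style reparametrization: weights are a linear combination of
# frozen pseudo-random "basis" tensors regenerated on the fly from a seed, so
# only the seed and the learned mixture coefficients need to be stored.
import torch

def basis_weights(seed: int, idx: int, shape) -> torch.Tensor:
    g = torch.Generator().manual_seed(seed + idx)     # reproducible basis network
    return torch.randn(*shape, generator=g)

def reconstruct(seed: int, alphas: torch.Tensor, shape) -> torch.Tensor:
    """Rebuild weights from the scalar seed and learned coefficients."""
    return sum(a * basis_weights(seed, i, shape) for i, a in enumerate(alphas))

alphas = torch.randn(100)                             # learned mixture coefficients
w = reconstruct(seed=42, alphas=alphas, shape=(64, 64))
```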
+ + ♻ ☆ Discovery and Exploitation of Generalized Network Effects + + +
+ Given a large graph with few node labels, how can we (a) identify whether the graph exhibits generalized network effects (GNE) or not, (b) estimate GNE to explain the interrelations among node classes, and (c) exploit GNE to improve downstream tasks such as predicting the unknown labels accurately and efficiently? Knowledge of GNE is valuable for various tasks like node classification and targeted advertising. However, identifying and understanding GNE such as homophily, heterophily, or their combination is challenging in real-world graphs due to the limited availability of node labels and noisy edges. We propose NetEffect, a graph mining approach to address the above issues, enjoying the following properties: (i) Principled: a statistical test to determine the presence of GNE in a graph with few node labels; (ii) General and Explainable: a closed-form solution to estimate the specific type of GNE observed; and (iii) Accurate and Scalable: the integration of GNE for accurate and fast node classification. Applied to public, real-world graphs, NetEffect discovers the unexpected absence of GNE in numerous graphs that were previously thought to exhibit heterophily. Further, we show that incorporating GNE is effective for node classification. On a large real-world graph with 1.6M nodes and 22.3M edges, NetEffect achieves over 7x speedup (14 minutes vs. 2 hours) compared to most competitors.
+ comment: Under Submission
+ + ♻ ☆ Mobilizing Personalized Federated Learning in Infrastructure-Less and + Heterogeneous Environments via Random Walk Stochastic ADMM + + +
+ This paper explores the challenges of implementing Federated Learning (FL) in +practical scenarios featuring isolated nodes with data heterogeneity, which can +only be connected to the server through wireless links in an +infrastructure-less environment. To overcome these challenges, we propose a +novel mobilizing personalized FL approach, which aims to facilitate mobility +and resilience. Specifically, we develop a novel optimization algorithm called +Random Walk Stochastic Alternating Direction Method of Multipliers (RWSADMM). +RWSADMM capitalizes on the server's random movement toward clients and +formulates local proximity among their adjacent clients based on hard +inequality constraints rather than requiring consensus updates or introducing +bias via regularization methods. To mitigate the computational burden on the +clients, an efficient stochastic solver of the approximated optimization +problem is designed in RWSADMM, which provably converges to the stationary +point almost surely in expectation. Our theoretical and empirical results +demonstrate the provable fast convergence and substantial accuracy improvements +achieved by RWSADMM compared to baseline methods, along with its benefits of +reduced communication costs and enhanced scalability. + +
+ comment: 28 pages, 7 figures, 3 tables, 1 algorithm. Proof details are provided in the main body of the paper
+ + ♻ ☆ When Do Annotator Demographics Matter? Measuring the Influence of + Annotator Demographics with the POPQUORN Dataset + + +
+ Annotators are not fungible. Their demographics, life experiences, and backgrounds all contribute to how they label data. However, NLP has only recently considered how annotator identity might influence their decisions. Here, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering, Offensiveness, text Rewriting, and politeness rating with demographic Nuance). POPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a sample representative of the US population with regard to sex, age, and race. Through a series of analyses, we show that annotators' backgrounds play a significant role in their judgments. Further, our work shows that backgrounds not previously considered in NLP (e.g., education) are meaningful and should be considered. Our study suggests that understanding the background of annotators and collecting labels from a demographically balanced pool of crowd workers is important to reduce dataset bias. The dataset, annotator background, and annotation interface are available at https://github.com/Jiaxin-Pei/potato-prolific-dataset.
+ + ♻ ☆ Second-order Conditional Gradient Sliding + + +
+ Constrained second-order convex optimization algorithms are the method of +choice when a high accuracy solution to a problem is needed, due to their local +quadratic convergence. These algorithms require the solution of a constrained +quadratic subproblem at every iteration. We present the \emph{Second-Order +Conditional Gradient Sliding} (SOCGS) algorithm, which uses a projection-free +algorithm to solve the constrained quadratic subproblems inexactly. When the +feasible region is a polytope the algorithm converges quadratically in primal +gap after a finite number of linearly convergent iterations. Once in the +quadratic regime the SOCGS algorithm requires $\mathcal{O}(\log(\log +1/\varepsilon))$ first-order and Hessian oracle calls and $\mathcal{O}(\log +(1/\varepsilon) \log(\log1/\varepsilon))$ linear minimization oracle calls to +achieve an $\varepsilon$-optimal solution. This algorithm is useful when the +feasible region can only be accessed efficiently through a linear optimization +oracle, and computing first-order information of the function, although +possible, is costly. + +
+ + ♻ ☆ Theoretical Guarantees of Learning Ensembling Strategies with + Applications to Time Series Forecasting ICML 2023 + + +
+ Ensembling is among the most popular tools in machine learning (ML) due to +its effectiveness in minimizing variance and thus improving generalization. +Most ensembling methods for black-box base learners fall under the umbrella of +"stacked generalization," namely training an ML algorithm that takes the +inferences from the base learners as input. While stacking has been widely +applied in practice, its theoretical properties are poorly understood. In this +paper, we prove a novel result, showing that choosing the best stacked +generalization from a (finite or finite-dimensional) family of stacked +generalizations based on cross-validated performance does not perform "much +worse" than the oracle best. Our result strengthens and significantly extends +the results in Van der Laan et al. (2007). Inspired by the theoretical +analysis, we further propose a particular family of stacked generalizations in +the context of probabilistic forecasting, each one with a different sensitivity +for how much the ensemble weights are allowed to vary across items, timestamps +in the forecast horizon, and quantiles. Experimental results demonstrate the +performance gain of the proposed method. + +
+ comment: ICML 2023
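A minimal sketch of selecting a stacked generalization by cross-validated performance, the procedure the theoretical result speaks to: each candidate stacker maps base-learner forecasts to a final prediction, and the best-scoring one is kept. The base forecasts and candidate family are stand-ins.

```python
# Sketch of choosing a stacked generalization by cross-validated performance:
# each candidate stacker combines base-learner forecasts; the one with the best
# validation score is kept. Base forecasts and candidates are synthetic stand-ins.
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
base_preds = rng.normal(size=(500, 3))               # forecasts from 3 base learners
y = base_preds @ np.array([0.5, 0.3, 0.2]) + rng.normal(scale=0.1, size=500)

candidates = [LinearRegression(), Ridge(alpha=1.0), Ridge(alpha=10.0)]
scores = [cross_val_score(m, base_preds, y, cv=5).mean() for m in candidates]
best = candidates[int(np.argmax(scores))].fit(base_preds, y)  # CV-selected stacker
```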
+ + ♻ ☆ When to Show a Suggestion? Integrating Human Feedback in AI-Assisted + Programming + + +
+ AI-powered code-recommendation systems, such as Copilot and CodeWhisperer, provide code suggestions inside a programmer's environment (e.g., an IDE) with the aim of improving productivity. Since programmers accept and reject suggestions in these scenarios, such a system should ideally use this feedback in furtherance of that goal. In this work, we leverage prior data of programmers interacting with GitHub Copilot, a system used by millions of programmers, to develop interventions that can save programmer time. We propose a utility-theory framework that models this interaction with programmers and decides which suggestions to display. Our framework, Conditional suggestion Display from Human Feedback (CDHF), relies on a cascade of models that predict suggestion acceptance, selectively hiding suggestions to reduce both latency and programmer verification time. Using data from 535 programmers, we perform a retrospective evaluation of CDHF and show that we can avoid displaying a significant fraction of suggestions that would have been rejected, doing so without total knowledge of the suggestions themselves. We further demonstrate, through ablations on user study data, the importance of incorporating the programmer's latent unobserved state when deciding whether to display suggestions. Finally, we show that using suggestion acceptance as a reward signal for deciding which suggestions to display leads to reduced-quality suggestions, indicating an unexpected pitfall.
+ comment: Previous version of these results can be found in arXiv:2210.14306
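The utility-theoretic display rule reduces to a simple expected-value test; here is a sketch with assumed probabilities and time costs (not the paper's fitted values).

```python
# Sketch of the utility-theoretic display rule: show a suggestion only when the
# predicted acceptance probability makes its expected time saving outweigh the
# programmer's verification cost. Probabilities and costs are assumptions.
def should_display(p_accept: float, time_saved: float, verify_cost: float) -> bool:
    expected_utility = p_accept * time_saved - verify_cost
    return expected_utility > 0.0

# e.g., a suggestion predicted to be accepted 20% of the time, saving 30s when
# accepted but costing 10s to verify, is hidden: 0.2 * 30 - 10 < 0
print(should_display(p_accept=0.2, time_saved=30.0, verify_cost=10.0))  # False
```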
+ + ♻ ☆ Blockwise Parallel Transformer for Large Context Models + + +
+ Transformers have emerged as the cornerstone of state-of-the-art natural +language processing models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands posed by the +self-attention mechanism and the large feedforward network in Transformers +limit their ability to handle long sequences, thereby creating challenges for +tasks involving multiple long sequences or long-term dependencies. We present a +distinct approach, Blockwise Parallel Transformer (BPT), that leverages +blockwise computation of self-attention and feedforward network fusion to +minimize memory costs. By processing longer input sequences while maintaining +memory efficiency, BPT enables training sequences 32 times longer than vanilla +Transformers and up to 4 times longer than previous memory-efficient methods. +Extensive experiments on language modeling and reinforcement learning tasks +demonstrate the effectiveness of BPT in reducing memory requirements and +improving performance. + +
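A minimal sketch of the blockwise computation with an online softmax: query blocks are scored against key/value blocks one at a time while running max and normalizer statistics are maintained, so the full n x n attention matrix is never materialized. The block size is an assumption, and the paper additionally fuses the feedforward network, which is omitted here.

```python
# Sketch of blockwise self-attention with an online softmax: running max and
# normalizer statistics let each key/value block be folded in incrementally,
# so the full (n x n) score matrix is never materialized.
import torch

def blockwise_attention(q, k, v, block=128):
    n, d = q.shape
    out = torch.zeros_like(q)
    for i in range(0, n, block):
        qi = q[i:i+block]                                  # (b, d) query block
        m = torch.full((qi.shape[0], 1), float("-inf"))    # running max
        z = torch.zeros(qi.shape[0], 1)                    # running normalizer
        acc = torch.zeros_like(qi)                         # running weighted values
        for j in range(0, n, block):
            s = qi @ k[j:j+block].T / d ** 0.5             # block of scores
            m_new = torch.maximum(m, s.max(dim=1, keepdim=True).values)
            p = torch.exp(s - m_new)
            scale = torch.exp(m - m_new)                   # rescale old statistics
            z = z * scale + p.sum(dim=1, keepdim=True)
            acc = acc * scale + p @ v[j:j+block]
            m = m_new
        out[i:i+block] = acc / z
    return out

q = k = v = torch.randn(1024, 64)
y = blockwise_attention(q, k, v)   # matches dense softmax attention (no mask)
```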
+ + Multimedia 7 + +
+ + ☆ Towards enabling reliable immersive teleoperation through Digital Twin: + A UAV command and control use case + + +
+ This paper addresses the challenging problem of enabling reliable immersive teleoperation in scenarios where an Unmanned Aerial Vehicle (UAV) is remotely controlled by an operator via a cellular network. Such scenarios can be quite critical, particularly when the UAV lacks advanced equipment (e.g., Lidar-based auto stop) or when the network is subject to performance constraints (e.g., delay). To tackle these challenges, we propose a novel architecture leveraging Digital Twin (DT) technology to create a virtual representation of the physical environment. This virtual environment accurately mirrors the physical world, accounting for 3D surroundings, weather constraints, and network limitations. To enhance teleoperation, the UAV in the virtual environment is equipped with advanced features that may be absent in the real UAV. Furthermore, the proposed architecture introduces intelligent logic that utilizes information from both the virtual and physical environments to approve, deny, or correct actions initiated by the UAV operator. This anticipatory approach helps to mitigate potential risks. Through a series of field trials, we demonstrate the effectiveness of the proposed architecture in significantly improving the reliability of UAV teleoperation.
+ comment: Accepted by IEEE Globecom 2023
+ + ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023 + + +
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+ comment: Accepted by ICCV2023
+ + ☆ UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for + Temporal Forgery Localization ACM MM 2023 + + +
+ The emergence of artificial intelligence-generated content (AIGC) has raised +concerns about the authenticity of multimedia content in various fields. +However, existing research for forgery content detection has focused mainly on +binary classification tasks of complete videos, which has limited applicability +in industrial settings. To address this gap, we propose UMMAFormer, a novel +universal transformer framework for temporal forgery localization (TFL) that +predicts forgery segments with multimodal adaptation. Our approach introduces a +Temporal Feature Abnormal Attention (TFAA) module based on temporal feature +reconstruction to enhance the detection of temporal differences. We also design +a Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the +Feature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the +proposed method, we contribute a novel Temporal Video Inpainting Localization +(TVIL) dataset specifically tailored for video inpainting scenes. Our +experiments show that our approach achieves state-of-the-art performance on +benchmark datasets, including Lav-DF, TVIL, and Psynd, significantly +outperforming previous methods. The code and data are available at +https://github.com/ymhzyj/UMMAFormer/. + +
+ comment: 11 pages, 8 figures, 66 references. This paper has been accepted for ACM MM 2023
+ + ☆ UniPT: Universal Parallel Tuning for Transfer Learning with Efficient + Parameter and Memory + + +
+ Fine-tuning pre-trained models has emerged as a powerful technique in numerous domains, owing to its ability to leverage enormous pre-existing knowledge and achieve remarkable performance on downstream tasks. However, updating the parameters of entire networks is computationally intensive. Although state-of-the-art parameter-efficient transfer learning (PETL) methods significantly reduce the trainable parameters and storage demand, almost all of them still need to back-propagate the gradients through large pre-trained networks. This memory-intensive characteristic severely limits the applicability of PETL methods in real-world scenarios. To this end, we propose a new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT). Specifically, we facilitate the transfer process via a lightweight learnable parallel network, which consists of two modules: 1) a parallel interaction module that decouples the inherently sequential connections and processes the intermediate activations of the pre-trained network in a detached manner; and 2) a confidence aggregation module that adaptively learns optimal strategies for integrating cross-layer features. We evaluate UniPT with different backbones (e.g., VSE$\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging vision-and-language tasks (i.e., image-text retrieval, video-text retrieval, visual question answering, compositional question answering, and visual grounding). Extensive ablations on ten datasets validate that UniPT not only dramatically reduces memory consumption and outperforms the best memory-efficient competitor, but also achieves higher performance than existing PETL methods in low-memory scenarios across different architectures. Our code is publicly available at: https://github.com/Paranioar/UniPT.
+ comment: 13 pages, 5 figures
+ + ☆ Parameter-Efficient Transfer Learning for Audio-Visual-Language Tasks + + +
+ The pretrain-then-finetune paradigm has been widely used in various unimodal and multimodal tasks. However, finetuning all the parameters of a pre-trained model becomes prohibitive as model sizes grow exponentially. To address this issue, the adapter mechanism, which freezes the pre-trained model and only finetunes a few extra parameters, has been introduced and delivers promising results. Most studies on adapter architectures are dedicated to unimodal or bimodal tasks, while adapter architectures for trimodal tasks have not yet been investigated. This paper introduces a novel Long Short-Term Trimodal Adapter (LSTTA) approach for video understanding tasks involving audio, visual, and language modalities. Based on pre-trained models from the three modalities, the designed adapter module is inserted between sequential blocks to model the dense interactions across the three modalities. Specifically, LSTTA consists of two types of complementary adapter modules, namely the long-term semantic filtering module and the short-term semantic interaction module. The long-term semantic filtering module characterizes the temporal importance of the video frames, while the short-term semantic interaction module models local interactions within short periods. Compared to previous state-of-the-art trimodal learning methods pre-trained on a large-scale trimodal corpus, LSTTA is more flexible and can inherit any powerful unimodal or bimodal models. Experimental results on four typical trimodal learning tasks show the effectiveness of LSTTA over existing state-of-the-art methods.
+ + ☆ Cross-Modal Retrieval: A Systematic Review of Methods and Future + Directions + + +
+ With the exponential surge in diverse multi-modal data, traditional uni-modal +retrieval methods struggle to meet the needs of users demanding access to data +from various modalities. To address this, cross-modal retrieval has emerged, +enabling interaction across modalities, facilitating semantic matching, and +leveraging complementarity and consistency between different modal data. +Although prior literature undertook a review of the cross-modal retrieval +field, it exhibits numerous deficiencies pertaining to timeliness, taxonomy, +and comprehensiveness. This paper conducts a comprehensive review of +cross-modal retrieval's evolution, spanning from shallow statistical analysis +techniques to vision-language pre-training models. Commencing with a +comprehensive taxonomy grounded in machine learning paradigms, mechanisms, and +models, the paper then delves deeply into the principles and architectures +underpinning existing cross-modal retrieval methods. Furthermore, it offers an +overview of widely used benchmarks, metrics, and performances. Lastly, the +paper probes the prospects and challenges that confront contemporary +cross-modal retrieval, while engaging in a discourse on potential directions +for further progress in the field. To facilitate the research on cross-modal +retrieval, we develop an open-source code repository at +https://github.com/BMC-SDNU/Cross-Modal-Retrieval. + +
+ + ♻ ☆ CLE Diffusion: Controllable Light Enhancement Diffusion Model + + +
+ Low light enhancement has gained increasing importance with the rapid +development of visual creation and editing. However, most existing enhancement +algorithms are designed to homogeneously increase the brightness of images to a +pre-defined extent, limiting the user experience. To address this issue, we +propose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a +novel diffusion framework to provide users with rich controllability. Built +with a conditional diffusion model, we introduce an illumination embedding to +let users control their desired brightness level. Additionally, we incorporate +the Segment-Anything Model (SAM) to enable user-friendly region +controllability, where users can click on objects to specify the regions they +wish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves +competitive performance regarding quantitative metrics, qualitative results, +and versatile controllability. Project page: +https://yuyangyin.github.io/CLEDiffusion/ + +
+ comment: Accepted in Proceedings of the 31st ACM International Conference on Multimedia (MM '23)
+ + Computation and Language 19 + +
+ + ☆ Generations of Knowledge Graphs: The Crazy Ideas and the Business Impact + + +
+ Knowledge Graphs (KGs) have been used to support a wide range of +applications, from web search to personal assistant. In this paper, we describe +three generations of knowledge graphs: entity-based KGs, which have been +supporting general search and question answering (e.g., at Google and Bing); +text-rich KGs, which have been supporting search and recommendations for +products, bio-informatics, etc. (e.g., at Amazon and Alibaba); and the emerging +integration of KGs and LLMs, which we call dual neural KGs. We describe the +characteristics of each generation of KGs, the crazy ideas behind the scenes in +constructing such KGs, and the techniques developed over time to enable +industry impact. In addition, we use KGs as examples to demonstrate a recipe to +evolve research ideas from innovations to production practice, and then to the +next level of innovations, to advance both science and business. + +
+ + ☆ Symbolic and Language Agnostic Large Language Models + + +
+ We argue that the relative success of large language models (LLMs) is not a +reflection on the symbolic vs. subsymbolic debate but a reflection on employing +an appropriate strategy of bottom-up reverse engineering of language at scale. +However, due to the subsymbolic nature of these models whatever knowledge these +systems acquire about language will always be buried in millions of +microfeatures (weights) none of which is meaningful on its own. Moreover, and +due to their stochastic nature, these models will often fail in capturing +various inferential aspects that are prevalent in natural language. What we +suggest here is employing the successful bottom-up strategy in a symbolic +setting, producing symbolic, language agnostic and ontologically grounded large +language models. + +
+ comment: 4 pages - draft. arXiv admin note: substantial text overlap with arXiv:2306.00017
+ + ☆ Empowering Cross-lingual Abilities of Instruction-tuned Large Language + Models by Translation-following demonstrations + + +
+ The language ability of Large Language Models (LLMs) is often unbalanced towards English because of the imbalance in the distribution of the pre-training data. This disparity carries over into further fine-tuning and affects the cross-lingual abilities of LLMs. In this paper, we propose to empower Instruction-tuned LLMs (It-LLMs) in languages other than English by building semantic alignment between them. Hence, we propose CrossAlpaca, an It-LLM with cross-lingual instruction-following and translation-following demonstrations to improve semantic alignment between languages. We validate our approach on the multilingual Question Answering (QA) benchmarks XQUAD and MLQA and adapted versions of MMLU and BBH. Our models, tested on six different languages, outperform It-LLMs tuned on monolingual data. The final results show that instruction tuning on non-English data is not enough and that semantic alignment can be further improved by translation-following demonstrations.
+ + ☆ Generative AI for Business Strategy: Using Foundation Models to Create + Business Strategy Tools + + +
+ Generative models (foundation models) such as LLMs (large language models) +are having a large impact on multiple fields. In this work, we propose the use +of such models for business decision making. In particular, we combine +unstructured textual data sources (e.g., news data) with multiple foundation +models (namely, GPT4, transformer-based Named Entity Recognition (NER) models +and Entailment-based Zero-shot Classifiers (ZSC)) to derive IT (information +technology) artifacts in the form of a (sequence of) signed business networks. +We posit that such artifacts can inform business stakeholders about the state +of the market and their own positioning as well as provide quantitative +insights into improving their future outlook. + +
+ + ☆ Towards Vision-Language Mechanistic Interpretability: A Causal Tracing + Tool for BLIP ICCV 2023 + + +
+ Mechanistic interpretability seeks to understand the neural mechanisms that +enable specific behaviors in Large Language Models (LLMs) by leveraging +causality-based methods. While these approaches have identified neural circuits +that copy spans of text, capture factual knowledge, and more, they remain +unusable for multimodal models since adapting these tools to the +vision-language domain requires considerable architectural changes. In this +work, we adapt a unimodal causal tracing tool to BLIP to enable the study of +the neural mechanisms underlying image-conditioned text generation. We +demonstrate our approach on a visual question answering dataset, highlighting +the causal relevance of later layer representations for all tokens. +Furthermore, we release our BLIP causal tracing tool as open source to enable +further experimentation in vision-language mechanistic interpretability by the +community. Our code is available at +https://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability. + +
+
+ comment: Final version for 5th Workshop on Closing the Loop Between Vision and + Language (CLVL) @ ICCV 2023. 4 pages, 5 figures +
+
+
+
+
+ + ☆ Examining User-Friendly and Open-Sourced Large GPT Models: A Survey on + Language, Multimodal, and Scientific GPT Models + + +
+ Generative pre-trained transformer (GPT) models have revolutionized the field
+of natural language processing (NLP) with remarkable performance in various
+tasks and also extend their power to multimodal domains. Despite their success,
+large GPT models like GPT-4 face inherent limitations such as considerable
+size, high computational requirements, complex deployment processes, and closed
+development loops. These constraints restrict their widespread adoption and
+raise concerns regarding their responsible development and usage. The need for
+user-friendly, relatively small, and open-sourced alternative GPT models arises
+from the desire to overcome these limitations while retaining high performance.
+In this survey paper, we provide an examination of alternative open-sourced
+models of large GPTs, focusing on user-friendly and relatively small models
+that facilitate easier deployment and accessibility. Through this extensive
+survey, we aim to equip researchers, practitioners, and enthusiasts with a
+thorough understanding of user-friendly and relatively small open-sourced
+models of large GPTs, their current state, challenges, and future research
+directions, inspiring the development of more efficient, accessible, and
+versatile GPT models that cater to the broader scientific community and advance
+the field of general artificial intelligence. The source contents are
+continuously updated at https://github.com/GPT-Alternatives/gpt_alternatives.
+
+
+
+
+
+ + ☆ Detecting Language Model Attacks with Perplexity + + +
+ A novel hack involving Large Language Models (LLMs) has emerged, leveraging +adversarial suffixes to trick models into generating perilous responses. This +method has garnered considerable attention from reputable media outlets such as +the New York Times and Wired, thereby influencing public perception regarding +the security and safety of LLMs. In this study, we advocate the utilization of +perplexity as one of the means to recognize such potential attacks. The +underlying concept behind these hacks revolves around appending an unusually +constructed string of text to a harmful query that would otherwise be blocked. +This maneuver confuses the protective mechanisms and tricks the model into +generating a forbidden response. Such scenarios could result in providing +detailed instructions to a malicious user for constructing explosives or +orchestrating a bank heist. Our investigation demonstrates the feasibility of +employing perplexity, a prevalent natural language processing metric, to detect +these adversarial tactics before generating a forbidden response. By evaluating +the perplexity of queries with and without such adversarial suffixes using an +open-source LLM, we discovered that nearly 90 percent were above a perplexity +of 1000. This contrast underscores the efficacy of perplexity for detecting +this type of exploit. + +
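+ A minimal sketch of the detection recipe described above, assuming GPT-2 via
+Hugging Face transformers as the open-source scorer (the abstract does not name
+a specific model); the 1000 cutoff is the figure quoted in the abstract.
+
+import torch
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
+
+def perplexity(text: str) -> float:
+    ids = tokenizer(text, return_tensors="pt").input_ids
+    with torch.no_grad():
+        # labels=input_ids makes the model return mean token cross-entropy
+        loss = model(ids, labels=ids).loss
+    return torch.exp(loss).item()
+
+def looks_adversarial(query: str, threshold: float = 1000.0) -> bool:
+    # Machine-generated adversarial suffixes are highly "surprising" text,
+    # so they push perplexity far above that of natural queries.
+    return perplexity(query) > threshold
+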
+
+
+
+
+ + ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data +scientists) and practitioners (e.g., clinicians), hampering the full +utilization of ML for clinical data analysis. We investigated the potential of +the chatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap +and perform ML analyses efficiently. Real-world clinical datasets and study +details from large trials across various medical specialties were presented to +chatGPT CI without specific guidance. ChatGPT CI autonomously developed +state-of-the-art ML models based on the original study's training data to +predict clinical outcomes such as cancer development, cancer progression, +disease complications, or biomarkers such as pathogenic gene sequences. +Strikingly, these ML models matched or outperformed their published +counterparts. We conclude that chatGPT CI offers a promising avenue to +democratize ML in medicine, making advanced analytics accessible to non-ML +experts and promoting broader applications in medical research and practice. + +
+
+
+
+
+ + ☆ Situated Natural Language Explanations ACL 2023 + + +
+ Natural language is among the most accessible tools for explaining decisions
+to humans, and large pretrained language models (PLMs) have demonstrated
+impressive abilities to generate coherent natural language explanations (NLE).
+The existing NLE research perspectives do not take the audience into account.
+An NLE can have high textual quality, but it might not accommodate audiences'
+needs and preferences. To address this limitation, we propose an alternative
+perspective, situated NLE, including a situated generation framework and a
+situated evaluation framework. On the generation side, we propose simple prompt
+engineering methods that adapt the NLEs to situations. In human studies, the
+annotators preferred the situated NLEs. On the evaluation side, we set up
+automated evaluation scores in lexical, semantic, and pragmatic categories. The
+scores can be used to select the most suitable prompts to generate NLEs.
+Situated NLE provides a perspective for further research on automatic NLE
+generation.
+
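+ As a concrete illustration of the prompt-engineering idea above, here is a
+hypothetical Python sketch of a situation-conditioned NLE prompt; the audience
+description and field names are invented for illustration, not taken from the
+paper.
+
+def situated_nle_prompt(decision: str, evidence: str, audience: str) -> str:
+    # The audience string is the "situation"; it steers tone and depth.
+    return (
+        f"Explain the decision '{decision}' to {audience}.\n"
+        f"Evidence: {evidence}\n"
+        "Write a short natural language explanation matched to this "
+        "audience's background knowledge and information needs."
+    )
+
+prompt = situated_nle_prompt(
+    decision="loan application rejected",
+    evidence="debt-to-income ratio of 0.62",
+    audience="a first-time applicant with no finance background",
+)
+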
+
+ comment: A previous version was presented in ACL 2023 NLRSE workshop +
+
+
+
+
+ + ☆ MedAlign: A Clinician-Generated Dataset for Instruction Following with + Electronic Medical Records + + +
+ The ability of large language models (LLMs) to follow natural language +instructions with human-level fluency suggests many opportunities in healthcare +to reduce administrative burden and improve quality of care. However, +evaluating LLMs on realistic text generation tasks for healthcare remains +challenging. Existing question answering datasets for electronic health record +(EHR) data fail to capture the complexity of information needs and +documentation burdens experienced by clinicians. To address these challenges, +we introduce MedAlign, a benchmark dataset of 983 natural language instructions +for EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes +clinician-written reference responses for 303 instructions, and provides 276 +longitudinal EHRs for grounding instruction-response pairs. We used MedAlign to +evaluate 6 general domain LLMs, having clinicians rank the accuracy and quality +of each LLM response. We found high error rates, ranging from 35% (GPT-4) to +68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k +context lengths for GPT-4. Finally, we report correlations between clinician +rankings and automated natural language generation metrics as a way to rank +LLMs without human review. We make MedAlign available under a research data use +agreement to enable LLM evaluations on tasks aligned with clinician needs and +preferences. + +
+
+
+
+
+ + ☆ An Analysis of On-the-fly Determinization of Finite-state Automata + + +
+ In this paper we establish an abstraction of on-the-fly determinization of +finite-state automata using transition monoids and demonstrate how it can be +applied to bound the asymptotics. We present algebraic and combinatorial +properties that are sufficient for a polynomial state complexity of the +deterministic automaton constructed on-the-fly. A special case of our findings +is that automata with many non-deterministic transitions almost always admit a +determinization of polynomial complexity. Furthermore, we extend our ideas to +weighted finite-state automata. + +
+
+
+
+
+ + ☆ Confucius: Iterative Tool Learning from Introspection Feedback by + Easy-to-Difficult Curriculum + + +
+ Augmenting large language models (LLMs) with external tools has emerged as a
+promising approach to extending the capability of LLMs. Although some works
+employ open-source LLMs for the tool learning task, most of them are trained in
+a controlled environment in which LLMs only learn to execute the human-provided
+tools. However, selecting proper tools from a large toolset is also a crucial
+ability for the tool learning model to be applied in real-world applications.
+Existing methods usually directly employ self-instruction methods to train the
+model, which ignores differences in tool complexity. In this paper, we propose
+Confucius, a novel tool learning framework that trains LLMs to use complicated
+tools in real-world scenarios, and which contains two main phases: (1) we first
+propose a multi-stage learning method to teach the LLM to use various tools
+from an easy-to-difficult curriculum; (2) we then propose Iterative
+Self-instruct from Introspective Feedback (ISIF) to dynamically construct the
+dataset to improve the ability to use complicated tools. Extensive experiments
+conducted in both controlled and real-world settings demonstrate the
+superiority of our tool learning framework in real-world application scenarios
+compared to both tuning-free (e.g. ChatGPT, Claude) and tuning-based baselines
+(e.g. GPT4Tools).
+
+
+
+
+
+ + ☆ VoiceBank-2023: A Multi-Speaker Mandarin Speech Corpus for Constructing + Personalized TTS Systems for the Speech Impaired + + +
+ Services of personalized TTS systems for the Mandarin-speaking speech
+impaired are rarely mentioned. Taiwan started the VoiceBanking project in 2020,
+aiming to build a complete set of services to deliver personalized Mandarin TTS
+systems to amyotrophic lateral sclerosis patients. This paper reports the
+corpus design, corpus recording, data purging and correction for the corpus,
+and evaluations of the developed personalized TTS systems, for the VoiceBanking
+project. The developed corpus is named the VoiceBank-2023 speech corpus after
+its release year. The corpus contains 29.78 hours of utterances with prompts of
+short paragraphs and common phrases spoken by 111 native Mandarin speakers. The
+corpus is labeled with information about gender, degree of speech impairment,
+types of users, transcription, SNRs, and speaking rates. The VoiceBank-2023 is
+available by request for non-commercial use and welcomes all parties to join
+the VoiceBanking project to improve the services for the speech impaired.
+
+
+ comment: submitted to 26th International Conference of the ORIENTAL-COCOSDA +
+
+
+
+
+ + ♻ ☆ A Study on Robustness and Reliability of Large Language Model Code + Generation + + +
+ Recently, the large language models (LLMs) have shown extraordinary ability
+in understanding natural language and generating programming code. It has been
+a common practice for software engineers to consult LLMs when encountering
+coding questions. Although efforts have been made to avoid syntax errors and
+align the code with the intended semantics, the reliability and robustness of
+code generation from LLMs have not yet been thoroughly studied. Executable
+code is not equivalent to reliable and robust code, especially in the context
+of real-world software development. The misuse of APIs in the generated code
+could lead to severe problems, such as resource leaks or program crashes. To
+make things worse, the users of LLM code generation services are often the
+developers most vulnerable to such seemingly correct code: novice developers
+who are not familiar with the APIs for which LLMs generate code. They can
+hardly spot the misuse in the code generated by LLMs, which further
+facilitates incorrect code being applied in real-world software. Existing code
+evaluation benchmarks and datasets focus on crafting small tasks such as
+programming questions in coding interviews, which deviate from the problems
+developers actually bring to LLMs for real-world coding help. To fill this
+missing piece, in this work we propose a dataset, RobustAPI, for evaluating
+the reliability and robustness of code generated by LLMs. We collect 1208
+coding questions from StackOverflow on 24 representative Java APIs. We
+summarize the common misuse patterns of these APIs and evaluate them on
+current popular LLMs. The evaluation results show that even for GPT-4, 62% of
+the generated code contains API misuses, which would cause unexpected
+consequences if the code were introduced into real-world software.
+
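+ To make the notion of API misuse concrete, here is a toy, pattern-based
+stand-in for the kind of check a benchmark like RobustAPI enables; real misuse
+detection works on parsed ASTs, and the two Java patterns below are
+illustrative assumptions, not rules from the paper.
+
+import re
+
+def flags_stream_leak(java_code: str) -> bool:
+    """Flag a FileInputStream opened outside try-with-resources."""
+    opened = re.search(r"new\s+FileInputStream\(", java_code)
+    guarded = re.search(r"try\s*\(\s*[^)]*FileInputStream", java_code)
+    return bool(opened) and not bool(guarded)
+
+generated = 'FileInputStream in = new FileInputStream("a.txt"); in.read();'
+print(flags_stream_leak(generated))  # True: the stream may leak on exception
+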
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential
+requirement for satisfying user experience. Different from traditional
+e-commerce platforms that offer products, users search on life service
+platforms such as Meituan mainly for product providers, which usually have
+abundant structured information, e.g. name, address, category, thousands of
+products. Modeling search relevance with these rich structured contents is
+challenging due to the following issues: (1) there is language distribution
+discrepancy among different fields of structured document, making it difficult
+to directly adopt off-the-shelf pretrained language model based methods like
+BERT. (2) different fields usually have different importance and their lengths
+vary greatly, making it difficult to extract document information helpful for
+relevance matching.
+ To tackle these issues, in this paper we propose a novel two-stage
+pretraining and matching architecture for relevance matching with rich
+structured documents. At the pretraining stage, we propose an effective
+pretraining method that employs both query and multiple fields of document as
+inputs, including an effective information compression method for lengthy
+fields. At the relevance matching stage, a novel matching method is proposed
+by leveraging domain knowledge in search query to generate more effective
+document representations for relevance scoring. Extensive offline experiments
+and online A/B tests on millions of users verify that the proposed
+architectures effectively improve the performance of relevance modeling. The
+model has already been deployed online, serving the search traffic of Meituan
+for over a year.
+
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ RestGPT: Connecting Large Language Models with Real-World RESTful APIs + + +
+ Tool-augmented large language models (LLMs) have achieved remarkable progress
+in tackling a broad range of tasks. However, existing methods are mainly
+restricted to specifically designed tools and fail to fulfill complex
+instructions, having great limitations when confronted with real-world
+scenarios. In this paper, we explore a more realistic scenario by connecting
+LLMs with RESTful APIs, which adhere to the widely adopted REST software
+architectural style for web service development. To address the practical
+challenges of tackling complex instructions, we propose RestGPT, which exploits
+the power of LLMs and conducts a coarse-to-fine online planning mechanism to
+enhance the abilities of task decomposition and API selection. RestGPT also
+contains an API executor tailored for calling RESTful APIs, which can
+meticulously formulate parameters and parse API responses. To fully evaluate
+the performance of RestGPT, we propose RestBench, a high-quality benchmark
+which consists of two real-world scenarios and human-annotated instructions
+with gold solution paths. Experiments show that RestGPT is able to achieve
+impressive results in complex tasks and has strong robustness, which paves a
+new way towards AGI. RestGPT and RestBench are publicly available at
+https://restgpt.github.io/.
+
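+ A minimal sketch of the executor role described above: formulating parameters
+for one RESTful call and parsing the JSON response for the planner. The
+endpoint, path, and parameter names below are placeholders, not part of
+RestGPT or RestBench.
+
+import requests
+
+def call_restful_api(base_url: str, path: str, params: dict) -> dict:
+    # Formulate the request, then hand the parsed response to the planner.
+    resp = requests.get(f"{base_url}{path}", params=params, timeout=10)
+    resp.raise_for_status()
+    return resp.json()
+
+# e.g. one step of a coarse-to-fine plan: "find this person's movies"
+result = call_restful_api(
+    "https://api.example.com", "/search/person", {"query": "Nolan"}
+)
+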
+
+ comment: Add RestBench to evaluate RestGPT +
+
+
+
+
+ + ♻ ☆ TwHIN-BERT: A Socially-Enriched Pre-trained Language Model for + Multilingual Tweet Representations at Twitter + + +
+ Pre-trained language models (PLMs) are fundamental for natural language +processing applications. Most existing PLMs are not tailored to the noisy +user-generated text on social media, and the pre-training does not factor in +the valuable social engagement logs available in a social network. We present +TwHIN-BERT, a multilingual language model productionized at Twitter, trained on +in-domain data from the popular social network. TwHIN-BERT differs from prior +pre-trained language models as it is trained with not only text-based +self-supervision, but also with a social objective based on the rich social +engagements within a Twitter heterogeneous information network (TwHIN). Our +model is trained on 7 billion tweets covering over 100 distinct languages, +providing a valuable representation to model short, noisy, user-generated text. +We evaluate our model on various multilingual social recommendation and +semantic understanding tasks and demonstrate significant metric improvement +over established pre-trained language models. We open-source TwHIN-BERT and our +curated hashtag prediction and social engagement benchmark datasets to the +research community. + +
+
+
+
+
+ + ♻ ☆ Event knowledge in large language models: the gap between the impossible + and the unlikely + + +
+ Word co-occurrence patterns in language corpora contain a surprising amount +of conceptual knowledge. Large language models (LLMs), trained to predict words +in context, leverage these patterns to achieve impressive performance on +diverse semantic tasks requiring world knowledge. An important but understudied +question about LLMs' semantic abilities is whether they acquire generalized +knowledge of common events. Here, we test whether five pre-trained LLMs (from +2018's BERT to 2023's MPT) assign higher likelihood to plausible descriptions +of agent-patient interactions than to minimally different implausible versions +of the same event. Using three curated sets of minimal sentence pairs (total +n=1,215), we found that pre-trained LLMs possess substantial event knowledge, +outperforming other distributional language models. In particular, they almost +always assign higher likelihood to possible vs. impossible events (The teacher +bought the laptop vs. The laptop bought the teacher). However, LLMs show less +consistent preferences for likely vs. unlikely events (The nanny tutored the +boy vs. The boy tutored the nanny). In follow-up analyses, we show that (i) LLM +scores are driven by both plausibility and surface-level sentence features, +(ii) LLM scores generalize well across syntactic variants (active vs. passive +constructions) but less well across semantic variants (synonymous sentences), +(iii) some LLM errors mirror human judgment ambiguity, and (iv) sentence +plausibility serves as an organizing dimension in internal LLM representations. +Overall, our results show that important aspects of event knowledge naturally +emerge from distributional linguistic patterns, but also highlight a gap +between representations of possible/impossible and likely/unlikely events. + +
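+ The minimal-pair test described above is easy to reproduce in outline; the
+sketch below assumes GPT-2 as the scorer (one of many possible choices) and
+reuses the example sentences quoted in the abstract.
+
+import torch
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+tok = GPT2TokenizerFast.from_pretrained("gpt2")
+lm = GPT2LMHeadModel.from_pretrained("gpt2").eval()
+
+def log_likelihood(sentence: str) -> float:
+    ids = tok(sentence, return_tensors="pt").input_ids
+    with torch.no_grad():
+        loss = lm(ids, labels=ids).loss  # mean token cross-entropy
+    return -loss.item() * ids.size(1)    # total sentence log-likelihood
+
+plausible = log_likelihood("The teacher bought the laptop.")
+implausible = log_likelihood("The laptop bought the teacher.")
+print(plausible > implausible)  # event knowledge predicts True
+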
+
+ comment: The two lead authors have contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Protecting Language Generation Models via Invisible Watermarking ICML 2023 + + +
+ Language generation models have been an increasingly powerful enabler for +many applications. Many such models offer free or affordable API access, which +makes them potentially vulnerable to model extraction attacks through +distillation. To protect intellectual property (IP) and ensure fair use of +these models, various techniques such as lexical watermarking and synonym +replacement have been proposed. However, these methods can be nullified by +obvious countermeasures such as "synonym randomization". To address this issue, +we propose GINSEW, a novel method to protect text generation models from being +stolen through distillation. The key idea of our method is to inject secret +signals into the probability vector of the decoding steps for each target +token. We can then detect the secret message by probing a suspect model to tell +if it is distilled from the protected one. Experimental results show that +GINSEW can effectively identify instances of IP infringement with minimal +impact on the generation quality of protected APIs. Our method demonstrates an +absolute improvement of 19 to 29 points on mean average precision (mAP) in +detecting suspects compared to previous methods against watermark removal +attacks. + +
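+ The key idea stated above -- hiding a secret signal in the next-token
+probabilities at each decoding step -- can be caricatured in a few lines.
+This is a schematic re-creation, not GINSEW's actual construction; the
+sinusoidal form, frequency, and amplitude are arbitrary assumptions.
+
+import torch
+
+def watermark_probs(probs: torch.Tensor, key_freq: float = 7.0,
+                    eps: float = 1e-3) -> torch.Tensor:
+    # probs: (vocab_size,) next-token distribution for one decoding step
+    vocab = torch.arange(probs.size(0), dtype=probs.dtype)
+    signal = eps * torch.sin(2 * torch.pi * vocab * key_freq / probs.size(0))
+    marked = torch.clamp(probs + signal, min=0.0)
+    return marked / marked.sum()  # renormalize to a valid distribution
+
+# Detection then probes a suspect model and correlates its output
+# probabilities with the secret signal; a distilled copy inherits it.
+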
+
+ comment: ICML 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 48 + +
+
+
+ + ☆ High-Resolution Document Shadow Removal via A Large-Scale Real-World + Dataset and A Frequency-Aware Shadow Erasing Net + + +
+ Shadows often occur when we capture documents with casual equipment, which
+influences the visual quality and readability of the digital copies. Different
+from the algorithms for natural shadow removal, the algorithms in document
+shadow removal need to preserve the details of fonts and figures in
+high-resolution input. Previous works ignore this problem and remove the
+shadows via approximate attention and small datasets, which might not work in
+real-world situations. We handle high-resolution document shadow removal
+directly via a larger-scale real-world dataset and a carefully designed
+frequency-aware network. As for the dataset, we acquire over 7k pairs of
+high-resolution (2462 x 3699) images of real-world documents with various
+samples under different lighting circumstances, which is 10 times larger than
+existing datasets. As for the design of the network, we decouple the
+high-resolution images in the frequency domain, where the low-frequency details
+and high-frequency boundaries can be effectively learned via the carefully
+designed network structure. Powered by our network and dataset, the proposed
+method clearly shows a better performance than previous methods in terms of
+visual quality and numerical results. The code, models, and dataset are
+available at: https://github.com/CXH-Research/DocShadow-SD7K
+
+
+
+
+
+ + ☆ Post-Hoc Explainability of BI-RADS Descriptors in a Multi-task Framework + for Breast Cancer Detection and Segmentation SP + + +
+ Despite recent medical advancements, breast cancer remains one of the most +prevalent and deadly diseases among women. Although machine learning-based +Computer-Aided Diagnosis (CAD) systems have shown potential to assist +radiologists in analyzing medical images, the opaque nature of the +best-performing CAD systems has raised concerns about their trustworthiness and +interpretability. This paper proposes MT-BI-RADS, a novel explainable deep +learning approach for tumor detection in Breast Ultrasound (BUS) images. The +approach offers three levels of explanations to enable radiologists to +comprehend the decision-making process in predicting tumor malignancy. Firstly, +the proposed model outputs the BI-RADS categories used for BUS image analysis +by radiologists. Secondly, the model employs multi-task learning to +concurrently segment regions in images that correspond to tumors. Thirdly, the +proposed approach outputs quantified contributions of each BI-RADS descriptor +toward predicting the benign or malignant class using post-hoc explanations +with Shapley Values. + +
+
+ comment: 11 pages, 5 figures. Published at 2023 IEEE Workshop on MLSP +
+
+
+
+
+ + ☆ Exploring the Transfer Learning Capabilities of CLIP in Domain + Generalization for Diabetic Retinopathy + + +
+ Diabetic Retinopathy (DR), a leading cause of vision impairment, requires
+early detection and treatment. Developing robust AI models for DR
+classification holds substantial potential, but a key challenge is ensuring
+their generalization in unfamiliar domains with varying data distributions. To
+address this, our paper investigates cross-domain generalization, also known as
+domain generalization (DG), within the context of DR classification. DG, a
+challenging problem in the medical domain, is complicated by the difficulty of
+gathering labeled data across different domains, such as patient demographics
+and disease stages. Some recent studies have shown the effectiveness of using
+CLIP to handle the DG problem in natural images. In this study, we carry out
+comprehensive experiments to assess CLIP's transfer learning capabilities and
+its potential for cross-domain generalization in DR classification. Further, we
+introduce a multi-modal fine-tuning strategy named Context Optimization with
+Learnable Visual Tokens (CoOpLVT), which enhances context optimization by
+conditioning on visual features. Our findings demonstrate that the proposed
+method increases the F1-score by 1.8% over the baseline, thus underlining its
+promise for effective DG in DR classification. Our code is publicly available
+at https://github.com/Sanoojan/CLIP-DRDG.
+
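+ A loose sketch of what "conditioning context optimization on visual features"
+can look like, in the spirit of the CoOpLVT summary above; the module sizes
+and the additive conditioning are assumptions, not the published design.
+
+import torch
+import torch.nn as nn
+
+class ConditionalContext(nn.Module):
+    def __init__(self, n_ctx: int = 4, dim: int = 512):
+        super().__init__()
+        self.ctx = nn.Parameter(torch.randn(n_ctx, dim) * 0.02)  # learnable
+        self.meta = nn.Linear(dim, dim)  # maps image feature to a shift
+
+    def forward(self, image_feat: torch.Tensor) -> torch.Tensor:
+        # image_feat: (batch, dim) from a frozen CLIP image encoder
+        shift = self.meta(image_feat).unsqueeze(1)   # (batch, 1, dim)
+        return self.ctx.unsqueeze(0) + shift         # (batch, n_ctx, dim)
+
+# The conditioned context tokens are prepended to class-name tokens and
+# fed through CLIP's text encoder to build per-image classifier weights.
+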
+
+
+
+
+ + ☆ SketchDreamer: Interactive Text-Augmented Creative Sketch Ideation BMVC 2023 + + +
+ Artificial Intelligence Generated Content (AIGC) has shown remarkable
+progress in generating realistic images. However, in this paper, we take a step
+"backward" and address AIGC for the most rudimentary visual modality of human
+sketches. Our focus is on the creative nature of sketches, and we argue that
+creative sketching should take the form of an interactive process. We further
+enable text to drive the sketch ideation process, allowing creativity to be
+freely defined, while simultaneously tackling the challenge of "I can't
+sketch". We present a method to generate controlled sketches using a
+text-conditioned diffusion model trained on pixel representations of images.
+Our proposed approach, referred to as SketchDreamer, integrates a
+differentiable rasteriser of Bezier curves that optimises an initial input to
+distil abstract semantic knowledge from a pretrained diffusion model. We
+utilise Score Distillation Sampling to learn a sketch that aligns with a given
+caption, which importantly enables both text and sketch to interact with the
+ideation process. Our objective is to empower non-professional users to create
+sketches and, through a series of optimisation processes, transform a narrative
+into a storyboard by expanding the text prompt while making minor adjustments
+to the sketch input. Through this work, we hope to reshape the way we create
+visual content, democratise the creative process, and inspire further research
+in enhancing human creativity in AIGC. The code is available at
+\url{https://github.com/WinKawaks/SketchDreamer}.
+
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ☆ Score-Based Generative Models for PET Image Reconstruction + + +
+ Score-based generative models have demonstrated highly promising results for +medical image reconstruction tasks in magnetic resonance imaging or computed +tomography. However, their application to Positron Emission Tomography (PET) is +still largely unexplored. PET image reconstruction involves a variety of +challenges, including Poisson noise with high variance and a wide dynamic +range. To address these challenges, we propose several PET-specific adaptations +of score-based generative models. The proposed framework is developed for both +2D and 3D PET. In addition, we provide an extension to guided reconstruction +using magnetic resonance images. We validate the approach through extensive 2D +and 3D $\textit{in-silico}$ experiments with a model trained on +patient-realistic data without lesions, and evaluate on data without lesions as +well as out-of-distribution data with lesions. This demonstrates the proposed +method's robustness and significant potential for improved PET reconstruction. + +
+
+ comment: 35 pages, 16 figures, submitted to Journal of Machine Learning for + Biomedical Imaging (MELBA) +
+
+
+
+
+ + ☆ Towards Vision-Language Mechanistic Interpretability: A Causal Tracing + Tool for BLIP ICCV 2023 + + +
+ Mechanistic interpretability seeks to understand the neural mechanisms that +enable specific behaviors in Large Language Models (LLMs) by leveraging +causality-based methods. While these approaches have identified neural circuits +that copy spans of text, capture factual knowledge, and more, they remain +unusable for multimodal models since adapting these tools to the +vision-language domain requires considerable architectural changes. In this +work, we adapt a unimodal causal tracing tool to BLIP to enable the study of +the neural mechanisms underlying image-conditioned text generation. We +demonstrate our approach on a visual question answering dataset, highlighting +the causal relevance of later layer representations for all tokens. +Furthermore, we release our BLIP causal tracing tool as open source to enable +further experimentation in vision-language mechanistic interpretability by the +community. Our code is available at +https://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability. + +
+
+ comment: Final version for 5th Workshop on Closing the Loop Between Vision and + Language (CLVL) @ ICCV 2023. 4 pages, 5 figures +
+
+
+
+
+ + ☆ AIGC for Various Data Modalities: A Survey + + +
+ AI-generated content (AIGC) methods aim to produce text, images, videos, 3D +assets, and other media using AI algorithms. Due to its wide range of +applications and the demonstrated potential of recent works, AIGC developments +have been attracting a lot of attention recently, and AIGC methods have been +developed for various data modalities, such as image, video, text, 3D shape (as +voxels, point clouds, meshes, and neural implicit fields), 3D scene, 3D human +avatar (body and head), 3D motion, and audio -- each presenting different +characteristics and challenges. Furthermore, there have also been many +significant developments in cross-modality AIGC methods, where generative +methods can receive conditioning input in one modality and produce outputs in +another. Examples include going from various modalities to image, video, 3D +shape, 3D scene, 3D avatar (body and head), 3D motion (skeleton and avatar), +and audio modalities. In this paper, we provide a comprehensive review of AIGC +methods across different data modalities, including both single-modal and +cross-modality methods, highlighting the various challenges, representative +works, and recent technical directions in each setting. We also present +comparative results on several benchmark datasets in various modalities. +Moreover, we also discuss the challenges and potential future research +directions. + +
+
+
+
+
+
+ ☆ Integrated Segmentation and Detection Models for Dentex Challenge 2023
+
+
+ Dental panoramic x-rays are commonly used in dental diagnosing. With the
+development of deep learning, auto detection of diseases from dental panoramic
+x-rays can help dentists to diagnose diseases more efficiently. The Dentex
+Challenge 2023 is a competition for automatic detection of abnormal teeth along
+with their enumeration ids from dental panoramic x-rays. In this paper, we
+propose a method integrating segmentation and detection models to detect
+abnormal teeth as well as obtain their enumeration ids. Our codes are available
+at https://github.com/xyzlancehe/DentexSegAndDet.
+
+
+
+
+
+ + ☆ A Unified Transformer-based Network for multimodal Emotion Recognition + + +
+ The development of transformer-based models has resulted in significant +advances in addressing various vision and NLP-based research challenges. +However, the progress made in transformer-based methods has not been +effectively applied to biosensing research. This paper presents a novel Unified +Biosensor-Vision Multi-modal Transformer-based (UBVMT) method to classify +emotions in an arousal-valence space by combining a 2D representation of an +ECG/PPG signal with the face information. To achieve this goal, we first +investigate and compare the unimodal emotion recognition performance of three +image-based representations of the ECG/PPG signal. We then present our UBVMT +network which is trained to perform emotion recognition by combining the 2D +image-based representation of the ECG/PPG signal and the facial expression +features. Our unified transformer model consists of homogeneous transformer +blocks that take as an input the 2D representation of the ECG/PPG signal and +the corresponding face frame for emotion representation learning with minimal +modality-specific design. Our UBVMT model is trained by reconstructing masked +patches of video frames and 2D images of ECG/PPG signals, and contrastive +modeling to align face and ECG/PPG data. Extensive experiments on the +MAHNOB-HCI and DEAP datasets show that our Unified UBVMT-based model produces +comparable results to the state-of-the-art techniques. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Sparse Sampling Transformer with Uncertainty-Driven Ranking for Unified + Removal of Raindrops and Rain Streaks ICCV'23 + + +
+ In the real world, image degradations caused by rain often exhibit a +combination of rain streaks and raindrops, thereby increasing the challenges of +recovering the underlying clean image. Note that the rain streaks and raindrops +have diverse shapes, sizes, and locations in the captured image, and thus +modeling the correlation relationship between irregular degradations caused by +rain artifacts is a necessary prerequisite for image deraining. This paper aims +to present an efficient and flexible mechanism to learn and model degradation +relationships in a global view, thereby achieving a unified removal of +intricate rain scenes. To do so, we propose a Sparse Sampling Transformer based +on Uncertainty-Driven Ranking, dubbed UDR-S2Former. Compared to previous +methods, our UDR-S2Former has three merits. First, it can adaptively sample +relevant image degradation information to model underlying degradation +relationships. Second, explicit application of the uncertainty-driven ranking +strategy can facilitate the network to attend to degradation features and +understand the reconstruction process. Finally, experimental results show that +our UDR-S2Former clearly outperforms state-of-the-art methods for all +benchmarks. + +
+
+ comment: Accepted by ICCV'23 +
+
+
+
+
+ + ☆ Unaligned 2D to 3D Translation with Conditional Vector-Quantized Code + Diffusion using Transformers ICCV 2023 + + +
+ Generating 3D images of complex objects conditionally from a few 2D views is
+a difficult synthesis problem, compounded by issues such as domain gap and
+geometric misalignment. For instance, a unified framework such as Generative
+Adversarial Networks cannot achieve this unless they explicitly define both a
+domain-invariant and geometric-invariant joint latent distribution, whereas
+Neural Radiance Fields are generally unable to handle both issues as they
+optimize at the pixel level. By contrast, we propose a simple and novel 2D to
+3D synthesis approach based on conditional diffusion with vector-quantized
+codes. Operating in an information-rich code space enables high-resolution 3D
+synthesis via full-coverage attention across the views. Specifically, we
+generate the 3D codes (e.g. for CT images) conditional on previously generated
+3D codes and the entire codebook of two 2D views (e.g. 2D X-rays). Qualitative
+and quantitative results demonstrate state-of-the-art performance over
+specialized methods across varied evaluation criteria, including fidelity
+metrics such as density, coverage, and distortion metrics, for two complex
+volumetric imagery datasets from real-world scenarios.
+
+
+ comment: Camera-ready version for ICCV 2023 +
+
+
+
+
+ + ☆ Cheap Lunch for Medical Image Segmentation by Fine-tuning SAM on Few + Exemplars MICCAI + + +
+ The Segment Anything Model (SAM) has demonstrated remarkable capabilities of +scaled-up segmentation models, enabling zero-shot generalization across a +variety of domains. By leveraging large-scale foundational models as +pre-trained models, it is a natural progression to fine-tune SAM for specific +domains to further enhance performances. However, the adoption of foundational +models in the medical domain presents a challenge due to the difficulty and +expense of labeling sufficient data for adaptation within hospital systems. In +this paper, we introduce an efficient and practical approach for fine-tuning +SAM using a limited number of exemplars, making it suitable for such scenarios. +Our approach combines two established techniques from the literature: an +exemplar-guided synthesis module and the widely recognized Low-Rank Adaptation +(LoRA) fine-tuning strategy, serving as data-level and model-level attempts +respectively. Interestingly, our empirical findings suggest that SAM can be +effectively aligned within the medical domain even with few labeled data. We +validate our approach through experiments on brain tumor segmentation (BraTS) +and multi-organ CT segmentation (Synapse). The comprehensive results underscore +the feasibility and effectiveness of such an approach, paving the way for the +practical application of SAM in the medical domain. + +
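+ Since the abstract leans on the widely used Low-Rank Adaptation strategy, a
+minimal generic LoRA layer is sketched below; this is the standard technique
+in outline, not the authors' SAM-specific code, and the rank/alpha values are
+arbitrary defaults.
+
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 16.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False      # pre-trained weight stays frozen
+        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.scale = alpha / rank
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # the low-rank update B @ A is the only trainable part
+        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
+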
+
+ comment: Accepted by Brain Lesion (BrainLes) workshop of International + Conference on Medical Image Computing and Computer Assisted Intervention + (MICCAI BrainLes 2023). 10 pages, 3 figures +
+
+
+
+
+ + ☆ Synergizing Contrastive Learning and Optimal Transport for 3D Point + Cloud Domain Adaptation + + +
+ Recently, the fundamental problem of unsupervised domain adaptation (UDA) on
+3D point clouds has been motivated by a wide variety of applications in
+robotics, virtual reality, and scene understanding, to name a few. Point cloud
+data acquisition procedures introduce significant domain discrepancies and
+geometric variations among both similar and dissimilar classes. The standard
+domain adaptation methods developed for images do not directly translate to
+point cloud data because of their complex geometric nature. To address this
+challenge, we leverage the idea of multimodality and alignment between
+distributions. We propose a new UDA architecture for point cloud classification
+that benefits from multimodal contrastive learning to get better class
+separation in both domains individually. Further, the use of optimal transport
+(OT) aims at learning source and target data distributions jointly to reduce
+the cross-domain shift and provide a better alignment. We conduct a
+comprehensive empirical study on PointDA-10 and GraspNetPC-10 and show that our
+method achieves state-of-the-art performance on GraspNetPC-10 (with approx
+4-12% margin) and best average performance on PointDA-10. Our ablation studies
+and decision boundary analysis also validate the significance of our
+contrastive learning module and OT alignment.
+
+
+
+
+
+ + ☆ Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario + + +
+ Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to +improve model performance. Traditional SSL methods assume that labeled and +unlabeled data share the same label space. However, in real-world applications, +especially when the labeled training set is small, there may be classes that +are missing from the labeled set. Existing frameworks aim to either reject all +unseen classes (open-set SSL) or to discover unseen classes by partitioning an +unlabeled set during training (open-world SSL). In our work, we construct a +classifier for points from both seen and unseen classes. Our approach is based +on extending an existing SSL method, such as FlexMatch, by incorporating an +additional entropy loss. This enhancement allows our method to improve the +performance of any existing SSL method in the classification of both seen and +unseen classes. We demonstrate large improvement gains over state-of-the-art +SSL, open-set SSL, and open-world SSL methods, on two benchmark image +classification data sets, CIFAR-100 and STL-10. The gains are most pronounced +when the labeled data is severely limited (1-25 labeled examples per class). + +
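+ The stated recipe -- take an existing SSL objective and add an entropy term
+on unlabeled predictions -- fits in a few lines; the sign of the term and the
+weighting below are assumptions, not the paper's exact formulation.
+
+import torch
+import torch.nn.functional as F
+
+def entropy_term(logits_unlabeled: torch.Tensor) -> torch.Tensor:
+    p = F.softmax(logits_unlabeled, dim=-1)
+    return -(p * torch.log(p + 1e-8)).sum(dim=-1).mean()
+
+def total_loss(sup_loss, ssl_loss, logits_unlabeled, lam: float = 0.1):
+    # sup_loss / ssl_loss come from the base method (e.g. FlexMatch);
+    # the entropy term shapes predictions on points from unseen classes.
+    return sup_loss + ssl_loss + lam * entropy_term(logits_unlabeled)
+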
+
+
+
+
+ + ☆ Semantic-aware Consistency Network for Cloth-changing Person + Re-Identification ACM MM 2023 + + +
+ Cloth-changing Person Re-Identification (CC-ReID) is a challenging task that +aims to retrieve the target person across multiple surveillance cameras when +clothing changes might happen. Despite recent progress in CC-ReID, existing +approaches are still hindered by the interference of clothing variations since +they lack effective constraints to keep the model consistently focused on +clothing-irrelevant regions. To address this issue, we present a Semantic-aware +Consistency Network (SCNet) to learn identity-related semantic features by +proposing effective consistency constraints. Specifically, we generate the +black-clothing image by erasing pixels in the clothing area, which explicitly +mitigates the interference from clothing variations. In addition, to fully +exploit the fine-grained identity information, a head-enhanced attention module +is introduced, which learns soft attention maps by utilizing the proposed +part-based matching loss to highlight head information. We further design a +semantic consistency loss to facilitate the learning of high-level +identity-related semantic features, forcing the model to focus on semantically +consistent cloth-irrelevant regions. By using the consistency constraint, our +model does not require any extra auxiliary segmentation module to generate the +black-clothing image or locate the head region during the inference stage. +Extensive experiments on four cloth-changing person Re-ID datasets (LTCC, PRCC, +Vc-Clothes, and DeepChange) demonstrate that our proposed SCNet makes +significant improvements over prior state-of-the-art approaches. Our code is +available at: https://github.com/Gpn-star/SCNet. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Depth self-supervision for single image novel view synthesis + + +
+ In this paper, we tackle the problem of generating a novel image from an +arbitrary viewpoint given a single frame as input. While existing methods +operating in this setup aim at predicting the target view depth map to guide +the synthesis, without explicit supervision over such a task, we jointly +optimize our framework for both novel view synthesis and depth estimation to +unleash the synergy between the two at its best. Specifically, a shared depth +decoder is trained in a self-supervised manner to predict depth maps that are +consistent across the source and target views. Our results demonstrate the +effectiveness of our approach in addressing the challenges of both tasks +allowing for higher-quality generated images, as well as more accurate depth +for the target viewpoint. + +
+
+
+
+
+ + ☆ Unified and Dynamic Graph for Temporal Character Grouping in Long Videos + + +
+ Video temporal character grouping locates appearing moments of major
+characters within a video according to their identities. To this end, recent
+works have evolved from unsupervised clustering to graph-based supervised
+clustering. However, graph methods are built upon the premise of fixed affinity
+graphs, bringing many inexact connections. Besides, they extract multi-modal
+features with various kinds of models, which is unfriendly to deployment. In
+this paper, we present a unified and dynamic graph (UniDG) framework for
+temporal character grouping. This is accomplished firstly by a unified
+representation network that learns representations of multiple modalities
+within the same space while still preserving each modality's uniqueness.
+Secondly, we present a dynamic graph clustering where the neighbors of
+different quantities are dynamically constructed for each node via a cyclic
+matching strategy, leading to a more reliable affinity graph. Thirdly, a
+progressive association method is introduced to exploit spatial and temporal
+contexts among different modalities, allowing multi-modal clustering results to
+be well fused. As current datasets only provide pre-extracted features, we
+evaluate our UniDG method on a collected dataset named MTCG, which contains
+each character's appearing clips of face and body and speaking voice tracks. We
+also evaluate our key components on existing clustering and retrieval datasets
+to verify the generalization ability. Experimental results manifest that our
+method can achieve promising results and outperform several state-of-the-art
+approaches.
+
+
+
+
+
+ + ☆ Towards Unified Token Learning for Vision-Language Tracking + + +
+ In this paper, we present a simple, flexible and effective vision-language
+(VL) tracking pipeline, termed \textbf{MMTrack}, which casts VL tracking as a
+token generation task. Traditional paradigms address the VL tracking task
+indirectly with sophisticated prior designs, making them over-specialize on the
+features of specific architectures or mechanisms. In contrast, our proposed
+framework serializes language description and bounding box into a sequence of
+discrete tokens. In this new design paradigm, all token queries are required to
+perceive the desired target and directly predict spatial coordinates of the
+target in an auto-regressive manner. The design without other prior modules
+avoids learning multiple sub-tasks and hand-designed loss functions,
+significantly reducing the complexity of VL tracking modeling and allowing our
+tracker to use a simple cross-entropy loss as a unified optimization objective
+for the VL tracking task. Extensive experiments on TNL2K, LaSOT, LaSOT$_{\rm{ext}}$
+and OTB99-Lang benchmarks show that our approach achieves promising results,
+compared to other state-of-the-art methods.
+
+
+
+
+
+ + ☆ Superpixels algorithms through network community detection + + +
+ Community detection is a powerful tool from complex networks analysis that
+finds applications in various research areas. Several image segmentation
+methods rely for instance on community detection algorithms as a black box in
+order to compute undersegmentations, i.e. a small number of regions that
+represent areas of interest of the image. However, to the best of our
+knowledge, the efficiency of such an approach w.r.t. superpixels, which aim at
+representing the image at a smaller level while preserving as much as possible
+original information, has been neglected so far. The only related work seems to
+be the one by Liu et al. (IET Image Processing, 2022) that developed a
+superpixels algorithm using a so-called modularity maximization approach,
+leading to relevant results. We follow this line of research by studying the
+efficiency of superpixels computed by state-of-the-art community detection
+algorithms on a 4-connected pixel graph, the so-called pixel-grid. We first
+detect communities on such a graph and then apply a simple merging procedure
+that allows us to obtain the desired number of superpixels. As we shall see,
+such methods result in the computation of relevant superpixels as emphasized by
+both qualitative and quantitative experiments, according to different
+widely-used metrics based on ground-truth comparison or on superpixels only. We
+observe that the choice of the community detection algorithm has a great impact
+on the number of communities and hence on the merging procedure. Similarly,
+small variations on the pixel-grid may provide different results from both
+qualitative and quantitative viewpoints. For the sake of completeness, we
+compare our results with those of several state-of-the-art superpixels
+algorithms as computed by Stutz et al. (Computer Vision and Image
+Understanding, 2018).
+
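+ A compact version of the pipeline just described: detect communities on the
+4-connected pixel-grid, then merge down to a target count. Merging the
+smallest community into the next smallest is a deliberate simplification of
+the merging procedure, and the edge weighting is an assumption.
+
+import networkx as nx
+import numpy as np
+from networkx.algorithms.community import greedy_modularity_communities
+
+def superpixels(image: np.ndarray, n_target: int) -> list:
+    h, w = image.shape[:2]
+    g = nx.grid_2d_graph(h, w)  # the 4-connected pixel-grid
+    for u, v in g.edges():
+        diff = np.linalg.norm(image[u].astype(float) - image[v].astype(float))
+        g[u][v]["weight"] = 1.0 / (1.0 + diff)  # similar pixels bind strongly
+    comms = [set(c) for c in greedy_modularity_communities(g, weight="weight")]
+    while len(comms) > n_target:          # naive merge to the desired count
+        comms.sort(key=len)
+        smallest = comms.pop(0)
+        comms[0] |= smallest              # fold into the next smallest
+    return comms
+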
+
+
+
+
+ + ☆ Rethinking Exemplars for Continual Semantic Segmentation in Endoscopy + Scenes: Entropy-based Mini-Batch Pseudo-Replay + + +
+ Endoscopy is a widely used technique for the early detection of diseases or
+robotic-assisted minimally invasive surgery (RMIS). Numerous deep learning
+(DL)-based research works have been developed for automated diagnosis or
+processing of endoscopic view. However, existing DL models may suffer from
+catastrophic forgetting. When new target classes are introduced over time or
+cross institutions, the performance of old classes may suffer severe
+degradation. More seriously, data privacy and storage issues may lead to the
+unavailability of old data when updating the model. Therefore, it is necessary
+to develop a continual learning (CL) methodology to solve the problem of
+catastrophic forgetting in endoscopic image segmentation. To tackle this, we
+propose an Endoscopy Continual Semantic Segmentation (EndoCSS) framework that
+does not involve the storage and privacy issues of exemplar data. The framework
+includes a mini-batch pseudo-replay (MB-PR) mechanism and a self-adaptive noisy
+cross-entropy (SAN-CE) loss. The MB-PR strategy circumvents privacy and storage
+issues by generating pseudo-replay images through a generative model.
+Meanwhile, the MB-PR strategy can also correct the model deviation to the
+replay data and current training data, which arises from the significant
+difference in the amount of current and replay images. Therefore, the model can
+perform effective representation learning on both new and old tasks. SAN-CE
+loss can help model fitting by adjusting the model's output logits, and also
+improve the robustness of training. Extensive continual semantic segmentation
+(CSS) experiments on public datasets demonstrate that our method can robustly
+and effectively address the catastrophic forgetting brought by class increment
+in endoscopy scenes. The results show that our framework holds excellent
+potential for real-world deployment in a streaming learning manner.
+
+
+ comment: Accepted by Computers in Biology and Medicine +
+
+
+
+
+ + ☆ A comprehensive review on Plant Leaf Disease detection using Deep + learning + + +
+ Leaf disease is a common fatal disease for plants. Early diagnosis and
+detection are necessary to improve the prognosis of leaf diseases affecting
+plants. For predicting leaf disease, several automated systems have already
+been developed using different plant pathology imaging modalities. This paper
+provides a systematic review of the literature on leaf disease-based models for
+the diagnosis of various plant leaf diseases via deep learning. The advantages
+and limitations of different deep learning models including Vision Transformer
+(ViT), Deep convolutional neural network (DCNN), Convolutional neural network
+(CNN), Residual Skip Network-based Super-Resolution for Leaf Disease Detection
+(RSNSR-LDD), Disease Detection Network (DDN), and YOLO (You only look once) are
+described in this review. The review also shows that the studies related to
+leaf disease detection applied different deep learning models to a number of
+publicly available datasets. For comparing the performance of the models,
+different metrics such as accuracy, precision, recall, etc. were used in the
+existing studies.
+
+
+
+
+
+ + ☆ Practical Edge Detection via Robust Collaborative Learning + + +
+ Edge detection, as a core component in a wide range of vision-oriented tasks,
+is to identify object boundaries and prominent edges in natural images. An edge
+detector is desired to be both efficient and accurate for practical use. To
+achieve the goal, two key issues should be concerned: 1) How to liberate deep
+edge models from inefficient pre-trained backbones that are leveraged by most
+existing deep learning methods, for saving the computational cost and cutting
+the model size; and 2) How to mitigate the negative influence from noisy or
+even wrong labels in training data, which widely exist in edge detection due to
+the subjectivity and ambiguity of annotators, for the robustness and accuracy.
+In this paper, we attempt to simultaneously address the above problems via
+developing a collaborative learning based model, termed PEdger. The principle
+behind our PEdger is that the information learned from different training
+moments and heterogeneous (recurrent and non-recurrent in this work)
+architectures can be assembled to explore robust knowledge against noisy
+annotations, even without the help of pre-training on extra data. Extensive
+ablation studies together with quantitative and qualitative experimental
+comparisons on the BSDS500 and NYUD datasets are conducted to verify the
+effectiveness of our design, and demonstrate its superiority over other
+competitors in terms of accuracy, speed, and model size. Codes can be found at
+https://github.co/ForawardStar/PEdger.
+
+
+
+
+
+ + ☆ 4D Myocardium Reconstruction with Decoupled Motion and Shape Model ICCV2023 + + +
+ Estimating the shape and motion state of the myocardium is essential in
+diagnosing cardiovascular diseases. However, cine magnetic resonance (CMR)
+imaging is dominated by 2D slices, whose large slice spacing challenges
+inter-slice shape reconstruction and motion acquisition. To address this
+problem, we propose a 4D reconstruction method that decouples motion and shape,
+which can predict the inter-/intra- shape and motion estimation from a given
+sparse point cloud sequence obtained from limited slices. Our framework
+comprises a neural motion model and an end-diastolic (ED) shape model. The
+implicit ED shape model can learn a continuous boundary and encourage the
+motion model to predict without the supervision of ground truth deformation,
+and the motion model enables canonical input of the shape model by deforming
+any point from any phase to the ED phase. Additionally, the constructed
+ED-space enables pre-training of the shape model, thereby guiding the motion
+model and addressing the issue of data scarcity. To our knowledge, we propose
+the first 4D myocardial dataset, and we verify our method on the proposed,
+public, and cross-modal datasets, showing superior reconstruction performance
+and enabling various clinical applications.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Reconstructing Interacting Hands with Interaction Prior from Monocular + Images ICCV2023 + + +
+ Reconstructing interacting hands from monocular images is indispensable in +AR/VR applications. Most existing solutions rely on the accurate localization +of each skeleton joint. However, these methods tend to be unreliable due to the +severe occlusion and confusing similarity among adjacent hand parts. This also +defies human perception because humans can quickly imitate an interaction +pattern without localizing all joints. Our key idea is to first construct a +two-hand interaction prior and recast the interaction reconstruction task as +the conditional sampling from the prior. To expand more interaction states, a +large-scale multimodal dataset with physical plausibility is proposed. Then a +VAE is trained to further condense these interaction patterns as latent codes +in a prior distribution. When looking for image cues that contribute to +interaction prior sampling, we propose the interaction adjacency heatmap (IAH). +Compared with a joint-wise heatmap for localization, IAH assigns denser visible +features to those invisible joints. Compared with an all-in-one visible +heatmap, it provides more fine-grained local interaction information in each +interaction region. Finally, the correlations between the extracted features +and corresponding interaction codes are linked by the ViT module. Comprehensive +evaluations on benchmark datasets have verified the effectiveness of this +framework. The code and dataset are publicly available at +https://github.com/binghui-z/InterPrior_pytorch + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ U-SEANNet: A Simple, Efficient and Applied U-Shaped Network for + Diagnosing Nasal Diseases from Nasal Endoscopic Images + + +
+ Utilizing deep learning (DL) models to improve the early diagnosis of nasal
+diseases from nasal endoscopic images holds paramount importance. However, the
+lack of available datasets stymies advancements in this field. Furthermore,
+existing models fail to strike a good trade-off between diagnosis performance,
+model complexity and parameter size, rendering them unsuitable for practical
+application. To bridge these gaps, we created the first large-scale nasal
+endoscopy dataset, named 7-NasEID, comprising 11,352 images that span six nasal
+diseases and normal samples. Building on this, we proposed U-SEANNet, an
+innovative architecture underpinned by depth-wise separable convolutions.
+Additionally, to augment its discernment capabilities for subtle variations in
+input images, we further proposed the Global-Local Channel Feature Fusion
+Module, enabling U-SEANNet to focus on salient channel features from both
+global and local contexts. Notably, U-SEANNet's parameter size and GFLOPs are
+only 0.78M and 0.21, respectively. Employing 7-NasEID, we conducted five-fold
+cross-validation on U-SEANNet, juxtaposing its performance against seventeen
+renowned architectures. The experimental results suggest that U-SEANNet is the
+state-of-the-art (SOTA) model, achieving an accuracy of 93.58%, sensitivity of
+90.17%, and specificity of 91.27%. These findings demonstrate U-SEANNet's
+considerable potential for diagnosing nasal diseases in practical use,
+providing new insight for the development of effective nasal disease diagnosis
+tools.
+
+
+
+
+
+ + ☆ Sparse3D: Distilling Multiview-Consistent Diffusion for Object + Reconstruction from Sparse Views + + +
+ Reconstructing 3D objects from extremely sparse views is a long-standing and +challenging problem. While recent techniques employ image diffusion models for +generating plausible images at novel viewpoints or for distilling pre-trained +diffusion priors into 3D representations using score distillation sampling +(SDS), these methods often struggle to simultaneously achieve high-quality, +consistent, and detailed results for both novel-view synthesis (NVS) and +geometry. In this work, we present Sparse3D, a novel 3D reconstruction method +tailored for sparse view inputs. Our approach distills robust priors from a +multiview-consistent diffusion model to refine a neural radiance field. +Specifically, we employ a controller that harnesses epipolar features from +input views, guiding a pre-trained diffusion model, such as Stable Diffusion, +to produce novel-view images that maintain 3D consistency with the input. By +tapping into 2D priors from powerful image diffusion models, our integrated +model consistently delivers high-quality results, even when faced with +open-world objects. To address the blurriness introduced by conventional SDS, +we introduce the category-score distillation sampling (C-SDS) to enhance +detail. We conduct experiments on CO3DV2 which is a multi-view dataset of +real-world objects. Both quantitative and qualitative evaluations demonstrate +that our approach outperforms previous state-of-the-art works on the metrics +regarding NVS and geometry reconstruction. + +
+
+
+
+
+ + ☆ A Novel Multi-scale Attention Feature Extraction Block for Aerial Remote + Sensing Image Classification + + +
+ Classification of very high-resolution (VHR) aerial remote sensing (RS)
+images is a well-established research area in the remote sensing community as
+it provides valuable spatial information for decision-making. Existing works on
+VHR aerial RS image classification produce excellent classification
+performance; nevertheless, they have limited capability to represent VHR RS
+images containing complex and small objects well, thereby leading to
+performance instability. As such, we propose a novel plug-and-play multi-scale
+attention feature extraction block (MSAFEB) based on multi-scale convolution at
+two levels with skip connection, producing discriminative/salient information
+at a deeper/finer level. The experimental study on two benchmark VHR aerial RS
+image datasets (AID and NWPU) demonstrates that our proposal achieves a
+stable/consistent performance (minimum standard deviation of $0.002$) and
+competent overall classification performance (AID: 95.85\% and NWPU: 94.09\%).
+
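+ A hedged PyTorch sketch of the idea of a multi-scale block with channel
+attention and a skip connection; the kernel sizes and the squeeze-style
+attention are assumptions, not the paper's exact MSAFEB:
+
+import torch
+import torch.nn as nn
+
+class MultiScaleAttentionBlock(nn.Module):
+    def __init__(self, c):
+        super().__init__()
+        self.b3 = nn.Conv2d(c, c, 3, padding=1)   # fine-scale branch
+        self.b5 = nn.Conv2d(c, c, 5, padding=2)   # coarse-scale branch
+        self.attn = nn.Sequential(nn.AdaptiveAvgPool2d(1),
+                                  nn.Conv2d(2 * c, 2 * c, 1), nn.Sigmoid())
+        self.fuse = nn.Conv2d(2 * c, c, 1)
+    def forward(self, x):
+        feats = torch.cat([self.b3(x), self.b5(x)], dim=1)
+        feats = feats * self.attn(feats)          # channel attention
+        return x + self.fuse(feats)               # skip connection
+
+print(MultiScaleAttentionBlock(16)(torch.randn(1, 16, 32, 32)).shape)
+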
+
+ comment: The paper is under review in IEEE Geoscience and Remote Sensing + Letters Journal (IEEE-GRSL). This version may be deleted and/or updated based + on the journal's policy +
+
+
+
+
+ + ☆ FaceCoresetNet: Differentiable Coresets for Face Set Recognition + + +
+ In set-based face recognition, we aim to compute the most discriminative
+descriptor from an unbounded set of images and videos showing a single person.
+A discriminative descriptor balances two policies when aggregating information
+from a given set. The first is a quality-based policy: emphasizing high-quality
+and down-weighting low-quality images. The second is a diversity-based policy:
+emphasizing unique images in the set and down-weighting multiple occurrences of
+similar images as found in video clips, which can overwhelm the set
+representation. This work frames face-set representation as a differentiable
+coreset selection problem. Our model learns how to select a small coreset of
+the input set that balances quality and diversity policies using a learned
+metric parameterized by the face quality, optimized end-to-end. The selection
+process is a differentiable farthest-point sampling (FPS) realized by
+approximating the non-differentiable Argmax operation with differentiable
+sampling from the Gumbel-Softmax distribution of distances. The small coreset
+is later used as queries in a self- and cross-attention architecture to enrich
+the descriptor with information from the whole set. Our model is
+order-invariant and linear in the input set size. We set a new SOTA for set
+face verification on the IJB-B and IJB-C datasets. Our code is publicly
+available.
+
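+ The Gumbel-Softmax relaxation of farthest-point sampling is the key trick; a
+minimal PyTorch sketch follows (an illustration of the technique, not the
+authors' implementation; the centroid start and temperature are assumptions):
+
+import torch
+import torch.nn.functional as F
+
+def soft_fps(x, k, tau=0.5):
+    # x: (N, D) set features; returns a (k, D) soft coreset
+    selected = [x.mean(dim=0, keepdim=True)]        # start from the centroid
+    for _ in range(k - 1):
+        sel = torch.cat(selected, dim=0)
+        d = torch.cdist(x, sel).min(dim=1).values   # distance to nearest pick
+        # distances act as logits: farther points are more likely picked,
+        # and the soft one-hot keeps the whole selection differentiable
+        w = F.gumbel_softmax(d, tau=tau, hard=False)
+        selected.append(w.unsqueeze(0) @ x)         # convex combination
+    return torch.cat(selected, dim=0)
+
+coreset = soft_fps(torch.randn(100, 64), k=8)
+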
+
+
+
+
+ + ☆ Nonrigid Object Contact Estimation With Regional Unwrapping Transformer ICCV2023 + + +
+ Acquiring contact patterns between hands and nonrigid objects is a common
+concern in the vision and robotics community. However, existing learning-based
+methods focus more on contact with rigid objects from monocular images. When
+adopting them for nonrigid contact, a major problem is that the existing
+contact representation is restricted by the geometry of the object.
+Consequently, contact neighborhoods are stored in an unordered manner and
+contact features are difficult to align with image cues. At the core of our
+approach lies a novel hand-object contact representation called RUPs (Region
+Unwrapping Profiles), which unwrap the roughly estimated hand-object surfaces
+as multiple high-resolution 2D regional profiles. The region grouping strategy
+is consistent with the hand kinematic bone division because they are the
+primitive initiators for a composite contact pattern. Based on this
+representation, our Regional Unwrapping Transformer (RUFormer) learns the
+correlation priors across regions from monocular inputs and predicts
+corresponding contact and deformed transformations. Our experiments demonstrate
+that the proposed framework can robustly estimate the deformation degrees and
+deformed transformations, which makes it suitable for both nonrigid and rigid
+contact.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ DETDet: Dual Ensemble Teeth Detection + + +
+ The field of dentistry is in the era of digital transformation. Particularly, +artificial intelligence is anticipated to play a significant role in digital +dentistry. AI holds the potential to significantly assist dental practitioners +and elevate diagnostic accuracy. In alignment with this vision, the 2023 MICCAI +DENTEX challenge aims to enhance the performance of dental panoramic X-ray +diagnosis and enumeration through technological advancement. In response, we +introduce DETDet, a Dual Ensemble Teeth Detection network. DETDet encompasses +two distinct modules dedicated to enumeration and diagnosis. Leveraging the +advantages of teeth mask data, we employ Mask-RCNN for the enumeration module. +For the diagnosis module, we adopt an ensemble model comprising DiffusionDet +and DINO. To further enhance precision scores, we integrate a complementary +module to harness the potential of unlabeled data. The code for our approach +will be made accessible at https://github.com/Bestever-choi/Evident + +
+
+
+
+
+ + ☆ Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential + Generative Adversarial Networks + + +
+ In this paper, we propose a bi-modality medical image synthesis approach
+based on sequential generative adversarial network (GAN) and semi-supervised
+learning. Our approach consists of two generative modules that synthesize
+images of the two modalities in a sequential order. A method for measuring the
+synthesis complexity is proposed to automatically determine the synthesis order
+in our sequential GAN. Images of the modality with a lower complexity are
+synthesized first, and the counterparts with a higher complexity are generated
+later. Our sequential GAN is trained end-to-end in a semi-supervised manner. In
+supervised training, the joint distribution of bi-modality images is learned
+from real paired images of the two modalities by explicitly minimizing the
+reconstruction losses between the real and synthetic images. To avoid
+overfitting to limited training images, in unsupervised training, the marginal
+distribution of each modality is learned based on unpaired images by minimizing
+the Wasserstein distance between the distributions of real and fake images. We
+comprehensively evaluate the proposed model using two synthesis tasks based on
+three types of evaluation metrics and user studies. Visual and quantitative
+results demonstrate the superiority of our method over the state-of-the-art
+methods, with reasonable visual quality and clinical significance. Code is made
+publicly available at
+https://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis.
+
+
+
+
+
+ + ☆ Multi-model fusion for Aerial Vision and Dialog Navigation based on + human attention aids + + +
+ Drones have been widely used in many areas of our daily lives. They relieve
+people of the burden of holding a controller all the time and make drone
+control easier for people with disabilities or occupied hands. However, the
+control of aerial robots is more complicated than that of normal robots due to
+factors such as uncontrollable height. Therefore, it is crucial to develop an
+intelligent UAV that has the ability to talk to humans and follow natural
+language commands. In this report, we present an aerial navigation task for the
+2023 ICCV Conversation History. Based on the AVDN dataset containing more than
+3k recorded navigation trajectories and asynchronous human-robot conversations,
+we propose an effective method of fusion training of the Human Attention Aided
+Transformer model (HAA-Transformer) and the Human Attention Aided LSTM
+(HAA-LSTM) model, which achieves the prediction of navigation routing points
+and human attention. The method not only achieves high SR and SPL metrics, but
+also shows a 7% improvement in GP metrics compared to the baseline model.
+
+
+ comment: 4 pages, 1 figure
+
+
+
+
+ + ☆ Hierarchical Contrastive Learning for Pattern-Generalizable Image + Corruption Detection ICCV 2023 + + +
+ Effective image restoration with large-size corruptions, such as blind image
+inpainting, entails precise detection of corruption region masks, which remains
+extremely challenging due to diverse shapes and patterns of corruptions. In
+this work, we present a novel method for automatic corruption detection, which
+allows for blind corruption restoration without known corruption masks.
+Specifically, we develop a hierarchical contrastive learning framework to
+detect corrupted regions by capturing the intrinsic semantic distinctions
+between corrupted and uncorrupted regions. In particular, our model detects the
+corrupted mask in a coarse-to-fine manner by first predicting a coarse mask via
+contrastive learning in low-resolution feature space and then refining the
+uncertain areas of the mask via high-resolution contrastive learning. A
+specialized hierarchical interaction mechanism is designed to facilitate the
+knowledge propagation of contrastive learning at different scales, boosting the
+modeling performance substantially. The detected multi-scale corruption masks
+are then leveraged to guide the corruption restoration. By detecting corrupted
+regions through learning contrastive distinctions rather than the semantic
+patterns of corruptions, our model generalizes well across different corruption
+patterns. Extensive experiments demonstrate the following merits of our model:
+1) superior performance over other methods on both corruption detection and
+various image restoration tasks including blind inpainting and watermark
+removal, and 2) strong generalization across different corruption patterns such
+as graffiti, random noise or other image content. Codes and trained weights are
+available at https://github.com/xyfJASON/HCL .
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Pruning the Unlabeled Data to Improve Semi-Supervised Learning + + +
+ In the domain of semi-supervised learning (SSL), the conventional approach +involves training a learner with a limited amount of labeled data alongside a +substantial volume of unlabeled data, both drawn from the same underlying +distribution. However, for deep learning models, this standard practice may not +yield optimal results. In this research, we propose an alternative perspective, +suggesting that distributions that are more readily separable could offer +superior benefits to the learner as compared to the original distribution. To +achieve this, we present PruneSSL, a practical technique for selectively +removing examples from the original unlabeled dataset to enhance its +separability. We present an empirical study, showing that although PruneSSL +reduces the quantity of available training data for the learner, it +significantly improves the performance of various competitive SSL algorithms, +thereby achieving state-of-the-art results across several image classification +tasks. + +
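+ A hedged sketch of the pruning idea (a plausible proxy, not the authors'
+algorithm): cluster the unlabeled pool and keep only the points closest to
+their cluster centroid, so the remaining distribution is more separable:
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def prune_for_separability(x_unlabeled, n_clusters, keep_frac=0.7):
+    km = KMeans(n_clusters=n_clusters, n_init=10).fit(x_unlabeled)
+    # distance of every point to its assigned centroid
+    d = np.linalg.norm(x_unlabeled - km.cluster_centers_[km.labels_], axis=1)
+    keep = np.argsort(d)[: int(keep_frac * len(x_unlabeled))]
+    return x_unlabeled[keep]
+
+pruned = prune_for_separability(np.random.randn(1000, 16), n_clusters=10)
+print(pruned.shape)  # (700, 16)
+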
+
+
+
+
+ + ♻ ☆ Masked Diffusion as Self-supervised Representation Learner + + +
+ Denoising diffusion probabilistic models have recently demonstrated
+state-of-the-art generative performance and been used as strong pixel-level
+representation learners. This paper decomposes the interrelation between the
+generative capability and representation learning ability inherent in diffusion
+models. We present the masked diffusion model (MDM), a scalable self-supervised
+representation learner that substitutes the conventional additive Gaussian
+noise of traditional diffusion with a masking mechanism. Our proposed approach
+convincingly surpasses prior benchmarks, demonstrating remarkable advancements
+in both medical and natural image semantic segmentation tasks, particularly in
+few-shot scenarios.
+
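+ The substitution is easy to picture in code; a minimal sketch of a masking
+corruption in PyTorch (patch size and mask ratio are assumptions):
+
+import torch
+
+def mask_corrupt(x, ratio=0.75, patch=16):
+    # x: (B, C, H, W); zero out a random subset of patches, and train the
+    # model to restore them instead of denoising additive Gaussian noise
+    b, _, h, w = x.shape
+    gh, gw = h // patch, w // patch
+    keep = (torch.rand(b, 1, gh, gw, device=x.device) > ratio).float()
+    mask = keep.repeat_interleave(patch, 2).repeat_interleave(patch, 3)
+    return x * mask, mask
+
+corrupted, mask = mask_corrupt(torch.randn(4, 3, 224, 224))
+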
+
+
+
+
+ + ♻ ☆ Learning Melanocytic Cell Masks from Adjacent Stained Tissue MICCAI + 2022 + + +
+ Melanoma is one of the most aggressive forms of skin cancer, causing a large
+proportion of skin cancer deaths. However, melanoma diagnoses by pathologists
+show low interrater reliability. As melanoma is a cancer of the melanocyte,
+there is a clear need to develop a melanocytic cell segmentation tool that is
+agnostic to pathologist variability and automates pixel-level annotation.
+Gigapixel-level pathologist labeling, however, is impractical. Herein, we
+propose a means to train deep neural networks for melanocytic cell segmentation
+from hematoxylin and eosin (H&E) stained slides using paired
+immunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean
+IOU of 0.64 despite imperfect ground-truth labels.
+
+
+ comment: Medical Image Learning with Limited & Noisy Data Workshop at MICCAI
+ 2022
+
+
+
+
+ + ♻ ☆ Day2Dark: Pseudo-Supervised Activity Recognition beyond Silent Daylight + + +
+ This paper strives to recognize activities in the dark, as well as in the
+day. We first establish that state-of-the-art activity recognizers are
+effective during the day, but not trustworthy in the dark. The main causes are
+the limited availability of labeled dark videos to learn from, as well as the
+distribution shift towards the lower color contrast at test-time. To compensate
+for the lack of labeled dark videos, we introduce a pseudo-supervised learning
+scheme, which utilizes easy-to-obtain unlabeled and task-irrelevant dark videos
+to improve an activity recognizer in low light. As the lower color contrast
+results in visual information loss, we further propose to incorporate the
+complementary activity information within audio, which is invariant to
+illumination. Since the usefulness of audio and visual features differs
+depending on the amount of illumination, we introduce our `darkness-adaptive'
+audio-visual recognizer. Experiments on EPIC-Kitchens, Kinetics-Sound, and
+Charades demonstrate our proposals are superior to image enhancement, domain
+adaptation and alternative audio-visual fusion methods, and can even improve
+robustness to local darkness caused by occlusions. Project page:
+https://xiaobai1217.github.io/Day2Dark/
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence for Automatic Detection and Classification + Disease on the X-Ray Images + + +
+ Detecting and classifying diseases using X-ray images is one of the more
+challenging core tasks in the medical and research world. Due to the recent
+high interest in radiological images and AI, early detection of diseases in
+X-ray images has become notably more essential to prevent further spreading and
+flatten the curve. Innovations and revolutions of Computer Vision with Deep
+learning methods offer great promise for fast and accurate diagnosis of
+screening and detection from chest X-ray images (CXR). This work presents rapid
+detection of diseases in the lung using the efficient pre-trained RepVGG deep
+learning model for deep feature extraction and classification. We used X-ray
+images as an example to show the model's efficiency. To perform this task, we
+classify X-Ray images into Covid-19, Pneumonia, and Normal X-Ray images. We
+employ ROI extraction to improve the detection accuracy of the lung region,
+followed by data pre-processing and augmentation. We apply Artificial
+Intelligence technology to automatically highlight the affected areas of
+people's lungs. Based on the X-Ray images, an algorithm was developed that
+classifies X-Ray images with high accuracy and faster inference thanks to the
+architectural transformation of the model. We compared the accuracy and
+disease-detection performance of several deep learning frameworks. The study
+shows the high power of deep learning methods for COVID-19 detection from chest
+X-rays. The proposed framework offers better diagnostic accuracy in comparison
+with popular deep learning models, i.e., VGG, ResNet50, inceptionV3, DenseNet,
+and InceptionResnetV2.
+
+
+
+
+
+ + ♻ ☆ Implicit Autoencoder for Point-Cloud Self-Supervised Representation + Learning ICCV 2023 + + +
+ This paper advocates the use of implicit surface representation in +autoencoder-based self-supervised 3D representation learning. The most popular +and accessible 3D representation, i.e., point clouds, involves discrete samples +of the underlying continuous 3D surface. This discretization process introduces +sampling variations on the 3D shape, making it challenging to develop +transferable knowledge of the true 3D geometry. In the standard autoencoding +paradigm, the encoder is compelled to encode not only the 3D geometry but also +information on the specific discrete sampling of the 3D shape into the latent +code. This is because the point cloud reconstructed by the decoder is +considered unacceptable unless there is a perfect mapping between the original +and the reconstructed point clouds. This paper introduces the Implicit +AutoEncoder (IAE), a simple yet effective method that addresses the sampling +variation issue by replacing the commonly-used point-cloud decoder with an +implicit decoder. The implicit decoder reconstructs a continuous representation +of the 3D shape, independent of the imperfections in the discrete samples. +Extensive experiments demonstrate that the proposed IAE achieves +state-of-the-art performance across various self-supervised learning +benchmarks. + +
+
+ comment: Published in ICCV 2023. The code is available at + https://github.com/SimingYan/IAE +
+
+
+
+
+ + ♻ ☆ Few-shot Forgery Detection via Guided Adversarial Interpolation + + +
+ The increase in face manipulation models has led to a critical issue in
+society - the synthesis of realistic visual media. With the emergence of new
+forgery approaches at an unprecedented rate, existing forgery detection methods
+suffer from significant performance drops when applied to unseen novel forgery
+approaches. In this work, we address the few-shot forgery detection problem by
+1) designing a comprehensive benchmark based on coverage analysis among various
+forgery approaches, and 2) proposing Guided Adversarial Interpolation (GAI).
+Our key insight is that there exist transferable distribution characteristics
+between majority and minority forgery classes. Specifically, we enhance the
+discriminative ability against novel forgery approaches via adversarially
+interpolating the forgery artifacts of the minority samples to the majority
+samples under the guidance of a teacher network. Unlike the standard
+re-balancing method which usually results in over-fitting to minority classes,
+our method simultaneously takes account of the diversity of majority
+information as well as the significance of minority information. Extensive
+experiments demonstrate that our GAI achieves state-of-the-art performance on
+the established few-shot forgery detection benchmark. Notably, our method is
+also validated to be robust to choices of majority and minority forgery
+approaches. The formal publication version is available in Pattern Recognition.
+
+
+
+
+
+ + ♻ ☆ Local Context-Aware Active Domain Adaptation ICCV 2023 + + +
+ Active Domain Adaptation (ADA) queries the labels of a small number of +selected target samples to help adapting a model from a source domain to a +target domain. The local context of queried data is important, especially when +the domain gap is large. However, this has not been fully explored by existing +ADA works. In this paper, we propose a Local context-aware ADA framework, named +LADA, to address this issue. To select informative target samples, we devise a +novel criterion based on the local inconsistency of model predictions. Since +the labeling budget is usually small, fine-tuning model on only queried data +can be inefficient. We progressively augment labeled target data with the +confident neighbors in a class-balanced manner. Experiments validate that the +proposed criterion chooses more informative target samples than existing active +selection strategies. Furthermore, our full method clearly surpasses recent ADA +arts on various benchmarks. Code is available at https://github.com/tsun/LADA. + +
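+ A hedged sketch of a local-inconsistency acquisition score (the paper's
+exact criterion differs; this is the general idea): a target sample whose
+predicted class disagrees with its feature-space neighbours scores high:
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def local_inconsistency(feats, probs, k=10):
+    preds = probs.argmax(axis=1)
+    _, idx = NearestNeighbors(n_neighbors=k + 1).fit(feats).kneighbors(feats)
+    neighbour_preds = preds[idx[:, 1:]]          # drop self at column 0
+    return (neighbour_preds != preds[:, None]).mean(axis=1)
+
+feats = np.random.randn(500, 32)                 # target-domain features
+probs = np.random.dirichlet(np.ones(5), 500)     # model predictions
+query = np.argsort(-local_inconsistency(feats, probs))[:20]  # label top-20
+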
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SurroundOcc: Multi-Camera 3D Occupancy Prediction for Autonomous Driving ICCV 2023 + + +
+ 3D scene understanding plays a vital role in vision-based autonomous driving.
+While most existing methods focus on 3D object detection, they have difficulty
+describing real-world objects of arbitrary shapes and infinite classes. Towards
+a more comprehensive perception of a 3D scene, in this paper, we propose a
+SurroundOcc method to predict the 3D occupancy with multi-camera images. We
+first extract multi-scale features for each image and adopt spatial 2D-3D
+attention to lift them to the 3D volume space. Then we apply 3D convolutions to
+progressively upsample the volume features and impose supervision on multiple
+levels. To obtain dense occupancy prediction, we design a pipeline to generate
+dense occupancy ground truth without expensive occupancy annotations.
+Specifically, we fuse multi-frame LiDAR scans of dynamic objects and static
+scenes separately. Then we adopt Poisson Reconstruction to fill the holes and
+voxelize the mesh to get dense occupancy labels. Extensive experiments on
+nuScenes and SemanticKITTI datasets demonstrate the superiority of our method.
+Code and dataset are available at https://github.com/weiyithu/SurroundOcc
+
+
+ comment: Accepted to ICCV 2023. Code is available at + https://github.com/weiyithu/SurroundOcc +
+
+
+
+
+ + ♻ ☆ VDD: Varied Drone Dataset for Semantic Segmentation + + +
+ Semantic segmentation of drone images is critical to many aerial vision tasks +as it provides essential semantic details that can compensate for the lack of +depth information from monocular cameras. However, maintaining high accuracy of +semantic segmentation models for drones requires diverse, large-scale, and +high-resolution datasets, which are rare in the field of aerial image +processing. Existing datasets are typically small and focus primarily on urban +scenes, neglecting rural and industrial areas. Models trained on such datasets +are not sufficiently equipped to handle the variety of inputs seen in drone +imagery. In the VDD-Varied Drone Dataset, we offer a large-scale and densely +labeled dataset comprising 400 high-resolution images that feature carefully +chosen scenes, camera angles, and varied light and weather conditions. +Furthermore, we have adapted existing drone datasets to conform to our +annotation standards and integrated them with VDD to create a dataset 1.5 times +the size of fine annotation of Cityscapes. We have developed a novel DeepLabT +model, which combines CNN and Transformer backbones, to provide a reliable +baseline for semantic segmentation in drone imagery. Our experiments indicate +that DeepLabT performs admirably on VDD and other drone datasets. We expect +that our dataset will generate considerable interest in drone image +segmentation and serve as a foundation for other drone vision tasks. VDD is +freely available on our website at https://vddvdd.com . + +
+
+
+
+
+ + ♻ ☆ VMA: Divide-and-Conquer Vectorized Map Annotation System for Large-Scale + Driving Scene + + +
+ High-definition (HD) maps serve as the essential infrastructure of autonomous
+driving. In this work, we build up a systematic vectorized map annotation
+framework (termed VMA) for efficiently generating HD maps of large-scale
+driving scenes. We design a divide-and-conquer annotation scheme to solve the
+spatial extensibility problem of HD map generation, and abstract map elements
+with a variety of geometric patterns as a unified point sequence
+representation, which can be extended to most map elements in the driving
+scene. VMA is highly efficient and extensible, requiring negligible human
+effort, and flexible in terms of spatial scale and element type. We
+quantitatively and qualitatively validate the annotation performance on
+real-world urban and highway scenes, as well as the NYC Planimetric Database.
+VMA can significantly improve map generation efficiency and requires little
+human effort. On average, VMA takes 160 minutes to annotate a scene spanning
+hundreds of meters and reduces human cost by 52.3%, showing great application
+value. Code: https://github.com/hustvl/VMA.
+
+
+ comment: https://github.com/hustvl/VMA +
+
+
+
+
+ + ♻ ☆ Single image reflection removal via learning with multi-image + constraints + + +
+ Reflections are very common phenomena in our daily photography, which +distract people's attention from the scene behind the glass. The problem of +removing reflection artifacts is important but challenging due to its ill-posed +nature. The traditional approaches solve an optimization problem over the +constraints induced from multiple images, at the expense of large computation +costs. Recent learning-based approaches have demonstrated a significant +improvement in both performance and running time for single image reflection +removal, but are limited as they require a large number of synthetic +reflection/clean image pairs for direct supervision to approximate the ground +truth, at the risk of overfitting in the synthetic image domain and degrading +in the real image domain. In this paper, we propose a novel learning-based +solution that combines the advantages of the aforementioned approaches and +overcomes their drawbacks. Our algorithm works by learning a deep neural +network to optimize the target with joint constraints enhanced among multiple +input images during the training phase, but is able to eliminate reflections +only from a single input for evaluation. Our algorithm runs in real-time and +achieves state-of-the-art reflection removal performance on real images. We +further propose a strong network backbone that disentangles the background and +reflection information into separate latent codes, which are embedded into a +shared one-branch deep neural network for both background and reflection +predictions. The proposed backbone experimentally performs better than the +other common network implementations, and provides insightful knowledge to +understand the reflection removal task. + +
+
+
+
+
+ + ♻ ☆ LXL: LiDAR Excluded Lean 3D Object Detection with 4D Imaging Radar and + Camera Fusion + + +
+ As an emerging technology and a relatively affordable device, the 4D imaging +radar has already been confirmed effective in performing 3D object detection in +autonomous driving. Nevertheless, the sparsity and noisiness of 4D radar point +clouds hinder further performance improvement, and in-depth studies about its +fusion with other modalities are lacking. On the other hand, as a new image +view transformation strategy, "sampling" has been applied in a few image-based +detectors and shown to outperform the widely applied "depth-based splatting" +proposed in Lift-Splat-Shoot (LSS), even without image depth prediction. +However, the potential of "sampling" is not fully unleashed. In this paper, we +investigate the "sampling" view transformation strategy on the camera and 4D +imaging radar fusion-based 3D object detection. In the proposed LiDAR Excluded +Lean (LXL) model, predicted image depth distribution maps and radar 3D +occupancy grids are generated from image perspective view (PV) features and +radar bird's eye view (BEV) features, respectively. They are sent to the core +of LXL, called "radar occupancy-assisted depth-based sampling", to aid image +view transformation. Introducing image depths and radar information enhances +the "sampling" strategy and leads to more accurate view transformation. +Experiments on VoD and TJ4DRadSet datasets show that the proposed method +outperforms the state-of-the-art 3D object detection methods by a significant +margin without bells and whistles. Ablation studies demonstrate that our method +performs the best among different enhancement settings. + +
+
+
+
+
+ + ♻ ☆ SimpleMapping: Real-Time Visual-Inertial Dense Mapping with Deep + Multi-View Stereo + + +
+ We present a real-time visual-inertial dense mapping method capable of +performing incremental 3D mesh reconstruction with high quality using only +sequential monocular images and inertial measurement unit (IMU) readings. 6-DoF +camera poses are estimated by a robust feature-based visual-inertial odometry +(VIO), which also generates noisy sparse 3D map points as a by-product. We +propose a sparse point aided multi-view stereo neural network (SPA-MVSNet) that +can effectively leverage the informative but noisy sparse points from the VIO +system. The sparse depth from VIO is firstly completed by a single-view depth +completion network. This dense depth map, although naturally limited in +accuracy, is then used as a prior to guide our MVS network in the cost volume +generation and regularization for accurate dense depth prediction. Predicted +depth maps of keyframe images by the MVS network are incrementally fused into a +global map using TSDF-Fusion. We extensively evaluate both the proposed +SPA-MVSNet and the entire visual-inertial dense mapping system on several +public datasets as well as our own dataset, demonstrating the system's +impressive generalization capabilities and its ability to deliver high-quality +3D mesh reconstruction online. Our proposed dense mapping system achieves a +39.7% improvement in F-score over existing systems when evaluated on the +challenging scenarios of the EuRoC dataset. + +
+
+
+
+
+ + ♻ ☆ EEP-3DQA: Efficient and Effective Projection-based 3D Model Quality + Assessment + + +
+ Considerable effort has been put into improving the effectiveness of 3D model
+quality assessment (3DQA) methods. However, little attention has been paid to
+the computational costs and inference time, which is also important for
+practical applications. Unlike 2D media, 3D models are represented by more
+complicated and irregular digital formats, such as point cloud and mesh. Thus
+it is normally difficult to design an efficient module to extract quality-aware
+features of 3D models. In this paper, we address this problem from the aspect
+of projection-based 3DQA and develop a no-reference (NR) \underline{E}fficient
+and \underline{E}ffective \underline{P}rojection-based \underline{3D} Model
+\underline{Q}uality \underline{A}ssessment (\textbf{EEP-3DQA}) method. The
+input projection images of EEP-3DQA are randomly sampled from the six
+perpendicular viewpoints of the 3D model and are further spatially downsampled
+by the grid-mini patch sampling strategy. Further, the lightweight
+Swin-Transformer tiny is utilized as the backbone to extract the quality-aware
+features. Finally, the proposed EEP-3DQA and EEP-3DQA-t (tiny version) achieve
+better performance than the existing state-of-the-art NR-3DQA methods, and
+even outperform most full-reference (FR) 3DQA methods on the point cloud and
+mesh quality assessment databases, while consuming less inference time than
+the compared 3DQA methods.
+
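+ A sketch of what grid-mini-patch spatial downsampling might look like (the
+grid size, patch size, and sampling position are assumptions):
+
+import torch
+
+def grid_mini_patch(img, grid=7, patch=32):
+    # split the projection into a grid x grid layout and keep one small
+    # patch per cell, preserving global layout at a fraction of the pixels
+    c, h, w = img.shape
+    ch, cw = h // grid, w // grid
+    rows = []
+    for i in range(grid):
+        row = [img[:, i*ch:i*ch+patch, j*cw:j*cw+patch] for j in range(grid)]
+        rows.append(torch.cat(row, dim=2))
+    return torch.cat(rows, dim=1)
+
+mini = grid_mini_patch(torch.randn(3, 1024, 1024))
+print(mini.shape)  # torch.Size([3, 224, 224])
+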
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Distributional Off-Policy Evaluation for Slate Recommendations + + +
+ Recommendation strategies are typically evaluated by using previously logged +data, employing off-policy evaluation methods to estimate their expected +performance. However, for strategies that present users with slates of multiple +items, the resulting combinatorial action space renders many of these methods +impractical. Prior work has developed estimators that leverage the structure in +slates to estimate the expected off-policy performance, but the estimation of +the entire performance distribution remains elusive. Estimating the complete +distribution allows for a more comprehensive evaluation of recommendation +strategies, particularly along the axes of risk and fairness that employ +metrics computable from the distribution. In this paper, we propose an +estimator for the complete off-policy performance distribution for slates and +establish conditions under which the estimator is unbiased and consistent. This +builds upon prior work on off-policy evaluation for slates and off-policy +distribution estimation in reinforcement learning. We validate the efficacy of +our method empirically on synthetic data as well as on a slate recommendation +simulator constructed from real-world data (MovieLens-20M). Our results show a +significant reduction in estimation variance and improved sample efficiency +over prior work across a range of slate structures. + +
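+ To make the target concrete, here is a minimal self-normalized
+importance-weighted CDF estimator (the paper's estimator additionally exploits
+slate structure, which is omitted in this sketch):
+
+import numpy as np
+
+def offpolicy_cdf(rewards, weights, grid):
+    # weights are target/logging propensity ratios for each logged episode
+    w = weights / weights.sum()
+    return np.array([(w * (rewards <= t)).sum() for t in grid])
+
+r = np.random.randn(10000)               # logged per-episode rewards
+w = np.random.rand(10000)                # importance weights
+cdf = offpolicy_cdf(r, w, np.linspace(-3, 3, 61))
+# risk metrics such as quantiles or CVaR can then be read off the CDF
+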
+
+
+
+
+ + ☆ Only Encode Once: Making Content-based News Recommender Greener + + +
+ Large pretrained language models (PLM) have become de facto news encoders in
+modern news recommender systems, due to their strong ability in comprehending
+textual content. These huge Transformer-based architectures, when finetuned on
+recommendation tasks, can greatly improve news recommendation performance.
+However, the PLM-based pretrain-finetune framework incurs high computational
+cost and energy consumption, primarily due to the extensive redundant
+processing of news encoding during each training epoch. In this paper, we
+propose the ``Only Encode Once'' framework for news recommendation (OLEO), by
+decoupling news representation learning from downstream recommendation task
+learning. The decoupled design makes content-based news recommenders as green
+and efficient as id-based ones, leading to a great reduction in computational
+cost and training resources. Extensive experiments show that our OLEO framework
+can reduce carbon emissions by up to 13 times compared with the
+state-of-the-art pretrain-finetune framework and maintain a competitive or even
+superior performance level. The source code is released for reproducibility.
+
+
+
+
+
+ + ☆ CTR is not Enough: a Novel Reinforcement Learning based Ranking Approach + for Optimizing Session Clicks + + +
+ Ranking is a crucial module used in recommender systems. In particular, the
+ranking module used in our YoungTao recommendation scenario provides an ordered
+list of items to users, to maximize the click number throughout the
+recommendation session for each user. However, we found that the traditional
+ranking method for optimizing Click-Through Rate (CTR) cannot address our
+ranking scenario well, since it completely ignores user leaving, and CTR is the
+optimization goal for one-step recommendation. To effectively undertake the
+purpose of our ranking module, we propose a long-term optimization goal, named
+CTE (Click-Through quantity Expectation), to explicitly take the behavior of
+user leaving into account. Based on CTE, we propose an effective model trained
+by reinforcement learning. Moreover, we build a simulation environment from
+offline log data for estimating PBR and CTR. We conduct extensive experiments
+on offline datasets and an online e-commerce scenario in TaoBao. Experimental
+results show that our method can boost performance effectively.
+
+
+
+
+
+ + ☆ Text Matching Improves Sequential Recommendation by Reducing Popularity + Biases CIKM 2023 + + +
+ This paper proposes Text mAtching based SequenTial rEcommendation model +(TASTE), which maps items and users in an embedding space and recommends items +by matching their text representations. TASTE verbalizes items and user-item +interactions using identifiers and attributes of items. To better characterize +user behaviors, TASTE additionally proposes an attention sparsity method, which +enables TASTE to model longer user-item interactions by reducing the +self-attention computations during encoding. Our experiments show that TASTE +outperforms the state-of-the-art methods on widely used sequential +recommendation datasets. TASTE alleviates the cold start problem by +representing long-tail items using full-text modeling and bringing the benefits +of pretrained language models to recommendation systems. Our further analyses +illustrate that TASTE significantly improves the recommendation accuracy by +reducing the popularity bias of previous item id based recommendation models +and returning more appropriate and text-relevant items to satisfy users. All +codes are available at https://github.com/OpenMatch/TASTE. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Analyzing and visualizing polarization and balance with signed networks: + the U.S. Congress case study + + +
+ Signed networks and balance theory provide a natural setting for real-world
+scenarios that show polarization dynamics, positive/negative relationships, and
+political partisanship. For example, they have been proven effective in
+studying the increasing polarization of the votes in the two chambers of the
+U.S. Congress from World War II on.
+ To provide further insights into this particular case study, we propose the
+application of a pipeline to analyze and visualize a signed graph's
+configuration based on the exploitation of the corresponding Laplacian matrix's
+spectral properties. The overall methodology is comparable with others based on
+the frustration index, but it has at least two main advantages: first, it
+requires a much lower computational cost; second, it allows for a quantitative
+and visual assessment of how arbitrarily small subgraphs (even single nodes)
+contribute to the overall balance (or unbalance) of the network.
+ The proposed pipeline allows the exploration of the polarization dynamics
+shown by the U.S. Congress from 1945 to 2020 at different resolution scales. In
+fact, we are able to spot and point out the influence of some (groups of)
+congressmen on the overall balance, as well as to observe and explore the
+evolution of polarization in both chambers across the years.
+
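+ The spectral idea is compact enough to sketch in a few lines of Python (a
+generic signed-Laplacian computation, not the authors' pipeline): the smallest
+eigenvalue of the signed Laplacian is zero exactly when the network is
+structurally balanced, and its eigenvector splits nodes into factions:
+
+import numpy as np
+
+def signed_laplacian_balance(a):
+    # a: signed adjacency matrix with entries +1 / -1 / 0
+    lap = np.diag(np.abs(a).sum(axis=1)) - a
+    vals, vecs = np.linalg.eigh(lap)
+    return vals[0], np.sign(vecs[:, 0])   # unbalance level, faction labels
+
+a = np.array([[ 0,  1, -1],
+              [ 1,  0, -1],
+              [-1, -1,  0]])              # two allies vs. one opponent
+level, factions = signed_laplacian_balance(a)
+print(round(level, 6), factions)          # ~0.0 -> balanced, factions split
+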
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential +requirement for satisfying user experience. Different from traditional +e-commerce platforms that offer products, users search on life service +platforms such as Meituan mainly for product providers, which usually have +abundant structured information, e.g. name, address, category, thousands of +products. Modeling search relevance with these rich structured contents is +challenging due to the following issues: (1) there is language distribution +discrepancy among different fields of structured document, making it difficult +to directly adopt off-the-shelf pretrained language model based methods like +BERT. (2) different fields usually have different importance and their length +vary greatly, making it difficult to extract document information helpful for +relevance matching. + To tackle these issues, in this paper we propose a novel two-stage +pretraining and matching architecture for relevance matching with rich +structured documents. At pretraining stage, we propose an effective pretraining +method that employs both query and multiple fields of document as inputs, +including an effective information compression method for lengthy fields. At +relevance matching stage, a novel matching method is proposed by leveraging +domain knowledge in search query to generate more effective document +representations for relevance scoring. Extensive offline experiments and online +A/B tests on millions of users verify that the proposed architectures +effectively improve the performance of relevance modeling. The model has +already been deployed online, serving the search traffic of Meituan for over a +year. + +
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ Causal Decision Transformer for Recommender Systems via Offline + Reinforcement Learning SIGIR'23 + + +
+ Reinforcement learning-based recommender systems have recently gained +popularity. However, the design of the reward function, on which the agent +relies to optimize its recommendation policy, is often not straightforward. +Exploring the causality underlying users' behavior can take the place of the +reward function in guiding the agent to capture the dynamic interests of users. +Moreover, due to the typical limitations of simulation environments (e.g., data +inefficiency), most of the work cannot be broadly applied in large-scale +situations. Although some works attempt to convert the offline dataset into a +simulator, data inefficiency makes the learning process even slower. Because of +the nature of reinforcement learning (i.e., learning by interaction), it cannot +collect enough data to train during a single interaction. Furthermore, +traditional reinforcement learning algorithms do not have a solid capability +like supervised learning methods to learn from offline datasets directly. In +this paper, we propose a new model named the causal decision transformer for +recommender systems (CDT4Rec). CDT4Rec is an offline reinforcement learning +system that can learn from a dataset rather than from online interaction. +Moreover, CDT4Rec employs the transformer architecture, which is capable of +processing large offline datasets and capturing both short-term and long-term +dependencies within the data to estimate the causal relationship between +action, state, and reward. To demonstrate the feasibility and superiority of +our model, we have conducted experiments on six real-world offline datasets and +one online simulator. + +
+
+ comment: Accepted by SIGIR'23, please check the camera-ready version for more + details such as the implementation +
+
+
+
+
+
+
+
+ + Machine Learning 33 + +
+
+
+ + ☆ Modeling Player Personality Factors from In-Game Behavior and Affective + Expression + + +
+ Developing a thorough understanding of the target audience (and/or single +individuals) is a key factor for success - which is exceptionally important and +powerful for the domain of video games that can not only benefit from informed +decision making during development, but ideally even tailor game content, +difficulty and player experience while playing. The granular assessment of +individual personality and differences across players is a particularly +difficult endeavor, given the highly variant human nature, disagreement in +psychological background models and because of the effortful data collection +that most often builds upon long, time-consuming and deterrent questionnaires. +In this work, we explore possibilities to predict a series of player +personality questionnaire metrics from recorded in-game behavior and extend +related work by explicitly adding affective dialog decisions to the game +environment which could elevate the model's accuracy. Using random forest +regression, we predicted a wide variety of personality metrics from seven +established questionnaires across 62 players over 60 minute gameplay of a +customized version of the role-playing game Fallout: New Vegas. While some +personality variables could already be identified from reasonable underlying +in-game actions and affective expressions, we did not find ways to predict +others or encountered questionable correlations that could not be justified by +theoretical background literature. Yet, building on the initial opportunities +of this explorative study, we are striving to massively enlarge our data set to +players from an ecologically valid industrial game environment and investigate +the performance of more sophisticated machine learning approaches. + +
+
+
+
+
+ + ☆ On Active Learning for Gaussian Process-based Global Sensitivity + Analysis + + +
+ This paper explores the application of active learning strategies to +adaptively learn Sobol indices for global sensitivity analysis. We demonstrate +that active learning for Sobol indices poses unique challenges due to the +definition of the Sobol index as a ratio of variances estimated from Gaussian +process surrogates. Consequently, learning strategies must either focus on +convergence in the numerator or the denominator of this ratio. However, rapid +convergence in either one does not guarantee convergence in the Sobol index. We +propose a novel strategy for active learning that focuses on resolving the main +effects of the Gaussian process (associated with the numerator of the Sobol +index) and compare this with existing strategies based on convergence in the +total variance (the denominator of the Sobol index). The new strategy, +implemented through a new learning function termed the MUSIC (minimize +uncertainty in Sobol index convergence), generally converges in Sobol index +error more rapidly than the existing strategies based on the Expected +Improvement for Global Fit (EIGF) and the Variance Improvement for Global Fit +(VIGF). Both strategies are compared with simple sequential random sampling and +the MUSIC learning function generally converges most rapidly for +low-dimensional problems. However, for high-dimensional problems, the +performance is comparable to random sampling. The new learning strategy is +demonstrated for a practical case of adaptive experimental design for +large-scale Boundary Layer Wind Tunnel experiments. + +
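+ As a reminder of the quantity being learned, here is a pick-freeze Monte
+Carlo estimate of a first-order Sobol index evaluated on a cheap surrogate
+(inputs assumed i.i.d. uniform on [0, 1]; this illustrates the index itself,
+not the MUSIC learning function):
+
+import numpy as np
+
+def main_effect_sobol(surrogate, dim, i, n=200_000, seed=0):
+    # S_i = Var(E[Y | X_i]) / Var(Y), the ratio of variances noted above
+    rng = np.random.default_rng(seed)
+    a, b = rng.random((n, dim)), rng.random((n, dim))
+    ab = b.copy()
+    ab[:, i] = a[:, i]                    # "freeze" coordinate i
+    ya, yab = surrogate(a), surrogate(ab)
+    f0 = 0.5 * (ya.mean() + yab.mean())
+    num = (ya * yab).mean() - f0 ** 2     # estimates Var(E[Y | X_i])
+    den = 0.5 * ((ya ** 2).mean() + (yab ** 2).mean()) - f0 ** 2
+    return num / den
+
+f = lambda x: x[:, 0] + 2.0 * x[:, 1]     # toy surrogate; exact S_1 = 0.8
+print(main_effect_sobol(f, dim=2, i=1))
+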
+
+ comment: 31 pages, 16 figures +
+
+
+
+
+ + ☆ Machine Learning for Administrative Health Records: A Systematic Review + of Techniques and Applications + + +
+ Machine learning provides many powerful and effective techniques for +analysing heterogeneous electronic health records (EHR). Administrative Health +Records (AHR) are a subset of EHR collected for administrative purposes, and +the use of machine learning on AHRs is a growing subfield of EHR analytics. +Existing reviews of EHR analytics emphasise that the data-modality of the EHR +limits the breadth of suitable machine learning techniques, and pursuable +healthcare applications. Despite emphasising the importance of data modality, +the literature fails to analyse which techniques and applications are relevant +to AHRs. AHRs contain uniquely well-structured, categorically encoded records +which are distinct from other data-modalities captured by EHRs, and they can +provide valuable information pertaining to how patients interact with the +healthcare system. + This paper systematically reviews AHR-based research, analysing 70 relevant +studies and spanning multiple databases. We identify and analyse which machine +learning techniques are applied to AHRs and which health informatics +applications are pursued in AHR-based research. We also analyse how these +techniques are applied in pursuit of each application, and identify the +limitations of these approaches. We find that while AHR-based studies are +disconnected from each other, the use of AHRs in health informatics research is +substantial and accelerating. Our synthesis of these studies highlights the +utility of AHRs for pursuing increasingly complex and diverse research +objectives despite a number of pervading data- and technique-based limitations. +Finally, through our findings, we propose a set of future research directions +that can enhance the utility of AHR data and machine learning techniques for +health informatics research. + +
+
+
+
+
+ + ☆ TimeTrail: Unveiling Financial Fraud Patterns through Temporal + Correlation Analysis + + +
+ In the field of financial fraud detection, understanding the underlying
+patterns and dynamics is important to ensure effective and reliable systems.
+This research introduces a new technique, "TimeTrail," which employs advanced
+temporal correlation analysis to explain complex financial fraud patterns. The
+technique leverages time-related insights to provide transparent and
+interpretable explanations for fraud detection decisions, enhancing
+accountability and trust.
+ The "TimeTrail" methodology consists of three key phases: temporal data
+enrichment, dynamic correlation analysis, and interpretable pattern
+visualization. Initially, raw financial transaction data is enriched with
+temporal attributes. Dynamic correlations between these attributes are then
+quantified using innovative statistical measures. Finally, a unified
+visualization framework presents these correlations in an interpretable manner.
+To validate the effectiveness of "TimeTrail," a study is conducted on a diverse
+financial dataset spanning various fraud scenarios. Results demonstrate the
+technique's capability to uncover hidden temporal correlations and patterns,
+performing better than conventional methods in both accuracy and
+interpretability. Moreover, a case study showcasing the application of
+"TimeTrail" in real-world scenarios highlights its utility for fraud detection.
+
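+ A hedged sketch of the dynamic-correlation phase using a rolling window
+(column names and the window length are illustrative, not from the paper):
+
+import numpy as np
+import pandas as pd
+
+n = 500
+df = pd.DataFrame({
+    "amount": np.random.lognormal(3, 1, n),
+    "tx_per_hour": np.random.poisson(4, n).astype(float),
+}, index=pd.date_range("2023-01-01", periods=n, freq="h"))
+
+# correlation between two temporally enriched attributes over a 24-step
+# rolling window; regime changes show up as jumps in this series
+rolling_corr = df["amount"].rolling(24).corr(df["tx_per_hour"])
+suspicious = rolling_corr[rolling_corr.abs() > 0.8]
+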
+
+
+
+
+ + ☆ Predictive Sparse Manifold Transform ICML + + +
+ We present Predictive Sparse Manifold Transform (PSMT), a minimalistic, +interpretable and biologically plausible framework for learning and predicting +natural dynamics. PSMT incorporates two layers where the first sparse coding +layer represents the input sequence as sparse coefficients over an overcomplete +dictionary and the second manifold learning layer learns a geometric embedding +space that captures topological similarity and dynamic temporal linearity in +sparse coefficients. We apply PSMT on a natural video dataset and evaluate the +reconstruction performance with respect to contextual variability, the number +of sparse coding basis functions and training samples. We then interpret the +dynamic topological organization in the embedding space. We next utilize PSMT +to predict future frames compared with two baseline methods with a static +embedding space. We demonstrate that PSMT with a dynamic embedding space can +achieve better prediction performance compared to static baselines. Our work +establishes that PSMT is an efficient unsupervised generative framework for +prediction of future visual stimuli. + +
+
+ comment: Paper presented at the 1st Workshop on High-dimensional Learning + Dynamics (HLD) at the 40th International Conference on Machine Learning + (ICML) 2023, Honolulu, Hawaii, USA + (https://sites.google.com/view/hidimlearning), 10 pages +
+
+
+
+
+ + ☆ Score-Based Generative Models for PET Image Reconstruction + + +
+ Score-based generative models have demonstrated highly promising results for +medical image reconstruction tasks in magnetic resonance imaging or computed +tomography. However, their application to Positron Emission Tomography (PET) is +still largely unexplored. PET image reconstruction involves a variety of +challenges, including Poisson noise with high variance and a wide dynamic +range. To address these challenges, we propose several PET-specific adaptations +of score-based generative models. The proposed framework is developed for both +2D and 3D PET. In addition, we provide an extension to guided reconstruction +using magnetic resonance images. We validate the approach through extensive 2D +and 3D $\textit{in-silico}$ experiments with a model trained on +patient-realistic data without lesions, and evaluate on data without lesions as +well as out-of-distribution data with lesions. This demonstrates the proposed +method's robustness and significant potential for improved PET reconstruction. + +
+
+ comment: 35 pages, 16 figures, submitted to Journal of Machine Learning for + Biomedical Imaging (MELBA) +
+
+
+
+
+ + ☆ Topological Augmentation for Class-Imbalanced Node Classification + + +
+ Class imbalance is prevalent in real-world node classification tasks and +often biases graph learning models toward majority classes. Most existing +studies root from a node-centric perspective and aim to address the class +imbalance in training data by node/class-wise reweighting or resampling. In +this paper, we approach the source of the class-imbalance bias from an +under-explored topology-centric perspective. Our investigation reveals that +beyond the inherently skewed training class distribution, the graph topology +also plays an important role in the formation of predictive bias: we identify +two fundamental challenges, namely ambivalent and distant message-passing, that +can exacerbate the bias by aggravating majority-class over-generalization and +minority-class misclassification. In light of these findings, we devise a +lightweight topological augmentation method ToBA to dynamically rectify the +nodes influenced by ambivalent/distant message-passing during graph learning, +so as to mitigate the class-imbalance bias. We highlight that ToBA is a +model-agnostic, efficient, and versatile solution that can be seamlessly +combined with and further boost other imbalance-handling techniques. Systematic +experiments validate the superior performance of ToBA in both promoting +imbalanced node classification and mitigating the prediction bias between +different classes. + +
+
+ comment: 19 pages, 8 figures +
+
+
+
+
+ + ☆ Leveraging Linear Independence of Component Classifiers: Optimizing Size + and Prediction Accuracy for Online Ensembles + + +
+ Ensembles, which employ a set of classifiers to enhance classification
+accuracy collectively, are crucial in the era of big data. However, although
+it is generally agreed that ensemble size is related to prediction accuracy,
+the exact nature of this relationship is still unknown. We introduce a novel
+perspective, rooted in the linear independence of classifiers' votes, to
+analyze the interplay between ensemble size and prediction accuracy. This
+framework reveals a theoretical link and consequently proposes an ensemble
+size based on this relationship. Our study builds upon a geometric framework
+and develops a series of theorems. These theorems clarify the role of linear
+dependency in crafting ensembles. We present a method to determine the minimum
+ensemble size required to ensure a target probability of linearly independent
+votes among component classifiers. Incorporating real and synthetic datasets,
+our empirical results demonstrate a trend: increasing the number of
+classifiers enhances accuracy, as predicted by our theoretical insights.
+However, we also identify a point of diminishing returns, beyond which
+additional classifiers provide diminishing improvements in accuracy.
+Surprisingly, the calculated ideal ensemble size deviates from empirical
+results for certain datasets, emphasizing the influence of other factors. This
+study opens avenues for deeper investigations into the complex dynamics
+governing ensemble design and offers guidance for constructing efficient and
+effective ensembles in practical scenarios.
+
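+ The diminishing-returns trend is easy to reproduce under the idealizing
+assumption of independent classifiers (a simulation of majority voting, not
+the paper's estimator):
+
+import numpy as np
+
+def majority_vote_accuracy(m, p_correct=0.6, trials=20_000, seed=0):
+    # m independent classifiers, each correct with probability p_correct
+    rng = np.random.default_rng(seed)
+    votes = rng.random((trials, m)) < p_correct
+    return (votes.sum(axis=1) > m / 2).mean()
+
+for m in (1, 5, 15, 45, 135):                 # odd sizes avoid tie-breaking
+    print(m, round(majority_vote_accuracy(m), 3))
+# accuracy climbs quickly at first, then flattens out
+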
+
+
+
+
+ + ☆ Integrated Approach of Gearbox Fault Diagnosis + + +
+ Gearbox fault diagnosis is one of the most important parts of any industrial
+system. Failure of components inside the gearbox can lead to catastrophic
+failure, uneven breakdown, and financial losses in an industrial organization.
+In that context, intelligent maintenance of the gearbox becomes essential. This
+paper presents an integrated gearbox fault diagnosis approach which can easily
+be deployed in online condition monitoring. This work introduces a
+nonparametric data preprocessing technique, the calculus enhanced energy
+operator (CEEO), to preserve the characteristic frequencies in the noisy and
+interference-affected vibration signal. A set of time domain and spectral
+domain features are calculated from the raw and CEEO vibration signals and
+input to the multiclass support vector machine (MCSVM) to diagnose faults in
+the system. An effective comparison between the raw signal and the CEEO signal
+is presented to show the impact of CEEO on gearbox fault diagnosis. The
+obtained results look very promising, and the approach can be implemented in
+any type of industrial system due to its nonparametric nature.
+
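+ For context, the classical discrete Teager-Kaiser energy operator is the
+building block that energy-operator preprocessing such as CEEO extends with
+calculus (derivative) terms; this sketch is not the paper's CEEO itself:
+
+import numpy as np
+
+def teager_kaiser(x):
+    # psi[x](n) = x(n)^2 - x(n-1) * x(n+1), tracking instantaneous energy
+    return x[1:-1] ** 2 - x[:-2] * x[2:]
+
+fs = 12_000                              # sampling rate in Hz (assumed)
+t = np.arange(0, 1, 1 / fs)
+vibration = np.sin(2 * np.pi * 30 * t) + 0.1 * np.random.randn(t.size)
+energy = teager_kaiser(vibration)        # emphasizes impulsive fault content
+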
+
+
+
+
+ + ☆ Hypergraph Structure Inference From Data Under Smoothness Prior + + +
+ Hypergraphs are important for processing data with higher-order relationships
+involving more than two entities. In scenarios where explicit hypergraphs are
+not readily available, it is desirable to infer a meaningful hypergraph
+structure from the node features to capture the intrinsic relations within the
+data. However, existing methods either adopt simple pre-defined rules that fail
+to precisely capture the distribution of the potential hypergraph structure, or
+learn a mapping between hypergraph structures and node features but require a
+large amount of labelled data, i.e., pre-existing hypergraph structures, for
+training. Both restrict their applications in practical scenarios. To fill this
+gap, we propose a novel smoothness prior that enables us to design a method to
+infer the probability for each potential hyperedge without labelled data as
+supervision. The proposed prior indicates that the features of nodes within a
+hyperedge are highly correlated through the features of the hyperedge
+containing them. We use this prior to derive the relation between the
+hypergraph structure and the node features via probabilistic modelling. This
+allows us to develop an unsupervised inference method to estimate the
+probability for each potential hyperedge via solving an optimisation problem
+that has an analytical solution. Experiments on both synthetic and real-world
+data demonstrate that our method can learn meaningful hypergraph structures
+from data more efficiently than existing hypergraph structure inference
+methods.
+
+
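+ The paper's analytical solution is not given in the abstract; purely as an
+illustrative toy consistent with the stated smoothness intuition (every design
+choice below is an assumption, not the paper's method), one can score candidate
+hyperedges by how tightly clustered their node features are:
+
+import numpy as np
+from itertools import combinations
+
+def hyperedge_probabilities(X, max_size=3, temp=1.0):
+    # toy smoothness proxy: candidates whose node features are tightly
+    # clustered score higher; a softmax turns scores into probabilities
+    edges, scores = [], []
+    for k in range(2, max_size + 1):
+        for e in combinations(range(len(X)), k):
+            v = X[list(e)]
+            edges.append(e)
+            scores.append(-np.mean((v - v.mean(axis=0)) ** 2))
+    s = np.asarray(scores) / temp
+    p = np.exp(s - s.max())
+    return dict(zip(edges, p / p.sum()))
+
+X = np.random.default_rng(0).standard_normal((6, 4))   # 6 nodes, 4-dim features
+probs = hyperedge_probabilities(X)
+print(max(probs, key=probs.get))                       # most plausible hyperedge
+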
+
+
+
+
+ + ☆ Distributional Off-Policy Evaluation for Slate Recommendations + + +
+ Recommendation strategies are typically evaluated by using previously logged +data, employing off-policy evaluation methods to estimate their expected +performance. However, for strategies that present users with slates of multiple +items, the resulting combinatorial action space renders many of these methods +impractical. Prior work has developed estimators that leverage the structure in +slates to estimate the expected off-policy performance, but the estimation of +the entire performance distribution remains elusive. Estimating the complete +distribution allows for a more comprehensive evaluation of recommendation +strategies, particularly along the axes of risk and fairness that employ +metrics computable from the distribution. In this paper, we propose an +estimator for the complete off-policy performance distribution for slates and +establish conditions under which the estimator is unbiased and consistent. This +builds upon prior work on off-policy evaluation for slates and off-policy +distribution estimation in reinforcement learning. We validate the efficacy of +our method empirically on synthetic data as well as on a slate recommendation +simulator constructed from real-world data (MovieLens-20M). Our results show a +significant reduction in estimation variance and improved sample efficiency +over prior work across a range of slate structures. + +
+
+
+
+
+ + ☆ Explaining with Attribute-based and Relational Near Misses: An + Interpretable Approach to Distinguishing Facial Expressions of Pain and + Disgust + + +
+ Explaining concepts by contrasting examples is an efficient and convenient
+way of giving insights into the reasons behind a classification decision. This
+is of particular interest in decision-critical domains, such as medical
+diagnostics. One particularly challenging use case is to distinguish facial
+expressions of pain from other states, such as disgust, due to the high
+similarity of their manifestation. In this paper, we present an approach for
+generating contrastive explanations to explain facial expressions of pain and
+disgust shown in video sequences. We implement and compare two approaches for
+contrastive explanation generation. The first approach explains a specific pain
+instance in contrast to the most similar disgust instance(s) based on the
+occurrence of facial expressions (attributes). The second approach takes into
+account which temporal relations hold between intervals of facial expressions
+within a sequence (relations). The input to our explanation generation approach
+is the output of an interpretable rule-based classifier for pain and disgust.
+We utilize two different similarity metrics to determine near misses and far
+misses as contrasting instances. Our results show that near miss explanations
+are shorter than far miss explanations, independent of the applied similarity
+metric. The outcome of our evaluation indicates that pain and disgust can be
+distinguished with the help of temporal relations. We currently plan
+experiments to evaluate how the explanations help in teaching concepts and how
+they could be enhanced by further modalities and interaction.
+
+
+
+
+
+
+ + ☆ Learning end-to-end inversion of circular Radon transforms in the + partial radial setup + + +
+ We present a deep learning-based computational algorithm for inversion of
+circular Radon transforms in the partial radial setup, arising in photoacoustic
+tomography. We first demonstrate that the truncated singular value
+decomposition-based method, which is the only traditional algorithm available
+to solve this problem, leads to severe artifacts which render the
+reconstructed field unusable. With the objective of overcoming this
+computational bottleneck, we train a ResBlock based U-Net to recover the
+inferred field that directly operates on the measured data. Numerical results
+with augmented Shepp-Logan phantoms, in the presence of noisy full and limited
+view data, demonstrate the superiority of the proposed algorithm.
+
+
+
+
+
+
+ + ☆ Integrated Variational Fourier Features for Fast Spatial Modelling with + Gaussian Processes + + +
+ Sparse variational approximations are popular methods for scaling up
+inference and learning in Gaussian processes to larger datasets. For $N$
+training points, exact inference has $O(N^3)$ cost; with $M \ll N$ features,
+state-of-the-art sparse variational methods have $O(NM^2)$ cost. Recently,
+methods have been proposed using more sophisticated features; these promise
+$O(M^3)$ cost, with good performance in low dimensional tasks such as spatial
+modelling, but they only work with a very limited class of kernels, excluding
+some of the most commonly used. In this work, we propose integrated Fourier
+features, which extend these performance benefits to a very broad class of
+stationary covariance functions. We motivate the method and choice of
+parameters from a convergence analysis and empirical exploration, and show
+practical speedup in synthetic and real world spatial regression tasks.
+
+
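+ The integrated Fourier features themselves are not specified in the abstract;
+to show where feature-based scaling comes from, the hedged sketch below uses
+the related classical random Fourier features of Rahimi & Recht (background
+material, not the paper's method) with Bayesian linear regression on top:
+
+import numpy as np
+
+def rff(X, M=100, lengthscale=1.0, seed=0):
+    # random Fourier features: phi(x) @ phi(x') approximates an RBF kernel
+    rng = np.random.default_rng(seed)
+    W = rng.standard_normal((X.shape[1], M)) / lengthscale
+    b = rng.uniform(0.0, 2.0 * np.pi, M)
+    return np.sqrt(2.0 / M) * np.cos(X @ W + b)
+
+# Bayesian linear regression on M features costs O(N M^2 + M^3),
+# versus O(N^3) for exact GP regression on N points.
+rng = np.random.default_rng(1)
+N, M, noise = 500, 100, 0.1
+X = rng.uniform(-3.0, 3.0, (N, 2))
+y = np.sin(X).sum(axis=1) + noise * rng.standard_normal(N)
+Phi = rff(X, M)
+w = np.linalg.solve(Phi.T @ Phi + noise**2 * np.eye(M), Phi.T @ y)
+y_pred = Phi @ w                     # posterior-mean prediction
+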
+
+
+
+
+ + ☆ Detecting Language Model Attacks with Perplexity + + +
+ A novel hack involving Large Language Models (LLMs) has emerged, leveraging +adversarial suffixes to trick models into generating perilous responses. This +method has garnered considerable attention from reputable media outlets such as +the New York Times and Wired, thereby influencing public perception regarding +the security and safety of LLMs. In this study, we advocate the utilization of +perplexity as one of the means to recognize such potential attacks. The +underlying concept behind these hacks revolves around appending an unusually +constructed string of text to a harmful query that would otherwise be blocked. +This maneuver confuses the protective mechanisms and tricks the model into +generating a forbidden response. Such scenarios could result in providing +detailed instructions to a malicious user for constructing explosives or +orchestrating a bank heist. Our investigation demonstrates the feasibility of +employing perplexity, a prevalent natural language processing metric, to detect +these adversarial tactics before generating a forbidden response. By evaluating +the perplexity of queries with and without such adversarial suffixes using an +open-source LLM, we discovered that nearly 90 percent were above a perplexity +of 1000. This contrast underscores the efficacy of perplexity for detecting +this type of exploit. + +
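+ A minimal sketch of the described check via Hugging Face transformers; the
+choice of GPT-2 as the scoring model is an assumption (the abstract does not
+name the open-source LLM used), while the threshold of 1000 comes from the
+abstract:
+
+import torch
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+tok = GPT2TokenizerFast.from_pretrained("gpt2")
+model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
+
+@torch.no_grad()
+def perplexity(text):
+    ids = tok(text, return_tensors="pt").input_ids
+    loss = model(ids, labels=ids).loss   # mean per-token negative log-likelihood
+    return torch.exp(loss).item()
+
+def looks_adversarial(query, threshold=1000.0):
+    # abstract: ~90% of suffix-bearing queries exceeded perplexity 1000
+    return perplexity(query) > threshold
+
+print(looks_adversarial("Describe gravity. zx!!qpt]]~~ veor ..!!"))
+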
+
+
+
+
+ + ☆ SPEED: Streaming Partition and Parallel Acceleration for Temporal + Interaction Graph Embedding + + +
+ Temporal Interaction Graphs (TIGs) are widely employed to model intricate
+real-world systems such as financial systems and social networks. To capture
+the dynamism and interdependencies of nodes, existing TIG embedding models need
+to process edges sequentially and chronologically. However, this requirement
+prevents them from being processed in parallel and makes it a struggle to fit
+burgeoning data volumes onto the GPU. Consequently, many large-scale temporal
+interaction graphs are confined to CPU processing. Furthermore, a generalized
+GPU scaling and acceleration approach remains unavailable. To facilitate
+large-scale TIGs' implementation on GPUs for acceleration, we introduce a novel
+training approach, namely Streaming Edge Partitioning and Parallel Acceleration
+for Temporal Interaction Graph Embedding (SPEED). SPEED comprises a
+Streaming Edge Partitioning Component (SEP), which addresses the space-overhead
+issue by assigning fewer nodes to each GPU, and a Parallel Acceleration
+Component (PAC), which enables simultaneous training of different sub-graphs,
+addressing the time-overhead issue. Our method can achieve a good balance in
+computing resources, computing time, and downstream task performance. Empirical
+validation across 7 real-world datasets demonstrates the potential to expedite
+training speeds by a factor of up to 19.29x. Simultaneously, resource
+consumption of a single GPU can be reduced by up to 69%, thus enabling
+multi-GPU training and acceleration encompassing millions of nodes and
+billions of edges. Furthermore, our approach also maintains its competitiveness
+in downstream tasks.
+
+
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data
+scientists) and practitioners (e.g., clinicians), hampering the full
+utilization of ML for clinical data analysis. We investigated the potential of
+the ChatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap
+and perform ML analyses efficiently. Real-world clinical datasets and study
+details from large trials across various medical specialties were presented to
+ChatGPT CI without specific guidance. ChatGPT CI autonomously developed
+state-of-the-art ML models based on the original study's training data to
+predict clinical outcomes such as cancer development, cancer progression,
+disease complications, or biomarkers such as pathogenic gene sequences.
+Strikingly, these ML models matched or outperformed their published
+counterparts. We conclude that ChatGPT CI offers a promising avenue to
+democratize ML in medicine, making advanced analytics accessible to non-ML
+experts and promoting broader applications in medical research and practice.
+
+
+
+
+
+
+ + ☆ Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario + + +
+ Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to
+improve model performance. Traditional SSL methods assume that labeled and
+unlabeled data share the same label space. However, in real-world applications,
+especially when the labeled training set is small, there may be classes that
+are missing from the labeled set. Existing frameworks aim to either reject all
+unseen classes (open-set SSL) or to discover unseen classes by partitioning an
+unlabeled set during training (open-world SSL). In our work, we construct a
+classifier for points from both seen and unseen classes. Our approach is based
+on extending an existing SSL method, such as FlexMatch, by incorporating an
+additional entropy loss. This enhancement allows our method to improve the
+performance of any existing SSL method in the classification of both seen and
+unseen classes. We demonstrate large gains over state-of-the-art
+SSL, open-set SSL, and open-world SSL methods on two benchmark image
+classification datasets, CIFAR-100 and STL-10. The gains are most pronounced
+when the labeled data is severely limited (1-25 labeled examples per class).
+
+
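+ The abstract only states that an entropy loss is added on top of a method
+like FlexMatch; its exact form, sign, and weighting are the paper's design
+choices, so the PyTorch term below is a hedged sketch of a standard
+Shannon-entropy penalty on unlabeled-batch predictions:
+
+import torch
+import torch.nn.functional as F
+
+def entropy_loss(logits):
+    # mean Shannon entropy of the predicted class distribution
+    logp = F.log_softmax(logits, dim=-1)
+    return -(logp.exp() * logp).sum(dim=-1).mean()
+
+# inside an SSL step (FlexMatch-style losses assumed computed elsewhere):
+# loss = sup_loss + lambda_u * unsup_loss + lambda_e * entropy_loss(logits_u)
+print(entropy_loss(torch.randn(8, 10)))
+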
+
+
+
+
+ + ☆ Hybrid Transformer-RNN Architecture for Household Occupancy Detection + Using Low-Resolution Smart Meter Data + + +
+ Residential occupancy detection has become an enabling technology in today's
+urbanized world for various smart home applications, such as building
+automation, energy management, and improved security and comfort.
+Digitalization of the energy system provides smart meter data that can be used
+for occupancy detection in a non-intrusive manner without causing concerns
+regarding privacy and data security. In particular, deep learning techniques
+make it possible to infer occupancy from low-resolution smart meter data, so
+that accurate occupancy detection can be achieved while preserving privacy.
+Our work is thus motivated to develop a privacy-aware and effective
+model for residential occupancy detection in contemporary living environments.
+Our model aims to leverage the advantages of both recurrent neural networks
+(RNNs), which are adept at capturing local temporal dependencies, and
+transformers, which are effective at handling global temporal dependencies. Our
+designed hybrid transformer-RNN model detects residential occupancy using
+hourly smart meter data, achieving an accuracy of nearly 92% across households
+with diverse profiles. We validate the effectiveness of our method using a
+publicly accessible dataset and demonstrate its performance by comparing it
+with state-of-the-art models, including attention-based occupancy detection
+methods.
+
+
+
+ comment: IEEE IECON 2023 (The 49th Annual Conference of the IEEE Industrial + Electronics Society) +
+
+
+
+
+ + ☆ Depth self-supervision for single image novel view synthesis + + +
+ In this paper, we tackle the problem of generating a novel image from an
+arbitrary viewpoint given a single frame as input. While existing methods
+operating in this setup aim at predicting the target view depth map to guide
+the synthesis, without explicit supervision over such a task, we jointly
+optimize our framework for both novel view synthesis and depth estimation to
+best unleash the synergy between the two. Specifically, a shared depth
+decoder is trained in a self-supervised manner to predict depth maps that are
+consistent across the source and target views. Our results demonstrate the
+effectiveness of our approach in addressing the challenges of both tasks,
+allowing for higher-quality generated images, as well as more accurate depth
+for the target viewpoint.
+
+
+
+
+
+
+ + ☆ Towards Generalizable Neural Solvers for Vehicle Routing Problems via + Ensemble with Transferrable Local Policy + + +
+ Machine learning has been adapted to help solve NP-hard combinatorial
+optimization problems. One prevalent way is learning to construct solutions by
+deep neural networks, which has been receiving more and more attention due to
+its high efficiency and lower requirement for expert knowledge. However, many
+neural construction methods for Vehicle Routing Problems (VRPs) focus on
+synthetic problem instances with limited scales and specified node
+distributions, leading to poor performance on real-world problems, which
+usually involve large scales together with complex and unknown node
+distributions. To make neural VRP solvers more practical in real-world
+scenarios, we design an auxiliary policy that learns from the local
+transferable topological features, named local policy, and integrate it with a
+typical constructive policy (which learns from the global information of VRP
+instances) to form an ensemble policy. With joint training, the aggregated
+policies perform cooperatively and complementarily to boost generalization.
+The experimental results on TSPLIB and CVRPLIB, two well-known benchmarks for
+the travelling salesman problem and capacitated VRP, show that the ensemble
+policy consistently achieves better generalization than state-of-the-art
+construction methods and even works well on real-world problems with several
+thousand nodes.
+
+
+
+
+
+
+ + ☆ The inverse problem for neural networks + + +
+ We study the problem of computing the preimage of a set under a neural +network with piecewise-affine activation functions. We recall an old result +that the preimage of a polyhedral set is again a union of polyhedral sets and +can be effectively computed. We show several applications of computing the +preimage for analysis and interpretability of neural networks. + +
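+ To make the recalled result concrete for a single ReLU layer $f(x) =
+\max(0, Wx + b)$ with $W \in \mathbb{R}^{m \times d}$ and a polyhedron
+$P = \{y : Ay \le c\}$ (standard material, restated here for concreteness):
+fix an activation pattern $s \in \{0,1\}^m$ and let $D_s = \mathrm{diag}(s)$.
+On the region $R_s = \{x : (Wx+b)_i \ge 0 \text{ if } s_i = 1,\ (Wx+b)_i \le 0
+\text{ if } s_i = 0\}$ the layer is affine, $f(x) = D_s(Wx+b)$, so
+$f^{-1}(P) = \bigcup_{s \in \{0,1\}^m} \{x \in R_s : A D_s (Wx+b) \le c\}$,
+a finite union of polyhedra. Composing layers repeats the argument, which is
+why the preimage is effectively computable, although the number of activation
+patterns can grow exponentially with $m$.
+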
+
+
+
+
+ + ☆ MedAlign: A Clinician-Generated Dataset for Instruction Following with + Electronic Medical Records + + +
+ The ability of large language models (LLMs) to follow natural language +instructions with human-level fluency suggests many opportunities in healthcare +to reduce administrative burden and improve quality of care. However, +evaluating LLMs on realistic text generation tasks for healthcare remains +challenging. Existing question answering datasets for electronic health record +(EHR) data fail to capture the complexity of information needs and +documentation burdens experienced by clinicians. To address these challenges, +we introduce MedAlign, a benchmark dataset of 983 natural language instructions +for EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes +clinician-written reference responses for 303 instructions, and provides 276 +longitudinal EHRs for grounding instruction-response pairs. We used MedAlign to +evaluate 6 general domain LLMs, having clinicians rank the accuracy and quality +of each LLM response. We found high error rates, ranging from 35% (GPT-4) to +68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k +context lengths for GPT-4. Finally, we report correlations between clinician +rankings and automated natural language generation metrics as a way to rank +LLMs without human review. We make MedAlign available under a research data use +agreement to enable LLM evaluations on tasks aligned with clinician needs and +preferences. + +
+
+
+
+
+ + ☆ Sampling with flows, diffusion and autoregressive neural networks: A + spin-glass perspective + + +
+ Recent years witnessed the development of powerful generative models based on
+flows, diffusion or autoregressive neural networks, achieving remarkable
+success in generating data from examples with applications in a broad range of
+areas. A theoretical analysis of the performance and understanding of the
+limitations of these methods remain, however, challenging. In this paper, we
+undertake a step in this direction by analysing the efficiency of sampling by
+these methods on a class of problems with a known probability distribution and
+comparing it with the sampling performance of more traditional methods such as
+the Monte Carlo Markov chain and Langevin dynamics. We focus on a class of
+probability distributions widely studied in the statistical physics of
+disordered systems that relate to spin glasses, statistical inference and
+constraint satisfaction problems.
+ We leverage the fact that sampling via flow-based, diffusion-based or
+autoregressive network methods can be equivalently mapped to the analysis of a
+Bayes optimal denoising of a modified probability measure. Our findings
+demonstrate that these methods encounter difficulties in sampling stemming from
+the presence of a first-order phase transition along the algorithm's denoising
+path. Our conclusions go both ways: we identify regions of parameters where
+these methods are unable to sample efficiently, while that is possible using
+standard Monte Carlo or Langevin approaches. We also identify regions where the
+opposite happens: standard approaches are inefficient while the discussed
+generative methods work well.
+
+
+
+ comment: 39 pages, 12 figures +
+
+
+
+
+ + ☆ Pruning the Unlabeled Data to Improve Semi-Supervised Learning + + +
+ In the domain of semi-supervised learning (SSL), the conventional approach +involves training a learner with a limited amount of labeled data alongside a +substantial volume of unlabeled data, both drawn from the same underlying +distribution. However, for deep learning models, this standard practice may not +yield optimal results. In this research, we propose an alternative perspective, +suggesting that distributions that are more readily separable could offer +superior benefits to the learner as compared to the original distribution. To +achieve this, we present PruneSSL, a practical technique for selectively +removing examples from the original unlabeled dataset to enhance its +separability. We present an empirical study, showing that although PruneSSL +reduces the quantity of available training data for the learner, it +significantly improves the performance of various competitive SSL algorithms, +thereby achieving state-of-the-art results across several image classification +tasks. + +
+
+
+
+
+ + ♻ ☆ Learning Melanocytic Cell Masks from Adjacent Stained Tissue MICCAI + 2022 + + +
+ Melanoma is one of the most aggressive forms of skin cancer, causing a large
+proportion of skin cancer deaths. However, melanoma diagnoses by pathologists
+show low interrater reliability. As melanoma is a cancer of the melanocyte,
+there is a clear need to develop a melanocytic cell segmentation tool that is
+agnostic to pathologist variability and automates pixel-level annotation.
+Gigapixel-level pathologist labeling, however, is impractical. Herein, we
+propose a means to train deep neural networks for melanocytic cell segmentation
+from hematoxylin and eosin (H&E) stained slides using paired
+immunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean
+IOU of 0.64 despite imperfect ground-truth labels.
+
+
+
+ comment: Medical Image Learning with Limited & Noisy Data Workshop at MICCAI
+ 2022
+
+
+
+
+
+ + ♻ ☆ How to choose the most appropriate centrality measure? A decision tree + approach + + +
+ Centrality metrics play a crucial role in network analysis, and the choice
+of specific measures significantly influences the accuracy of conclusions, as
+each measure represents a unique concept of node importance. Among over 400
+proposed indices, selecting the most suitable ones for specific applications
+remains a challenge. Existing approaches -- model-based, data-driven, and
+axiomatic -- have limitations, requiring association with models, training
+datasets, or restrictive axioms for each specific application. To address this,
+we introduce the culling method, which relies on the expert concept of
+centrality behavior on simple graphs. The culling method involves forming a set
+of candidate measures, generating a list of small graphs sufficient
+to distinguish the measures from each other, constructing a decision-tree
+survey, and identifying the measure consistent with the expert's concept. We
+apply this approach to a diverse set of 40 centralities, including novel
+kernel-based indices, and combine it with the axiomatic approach. Remarkably,
+only 13 small 1-trees are sufficient to separate all 40 measures, even for
+pairs of closely related ones. By adopting simple ordinal axioms like
+Self-consistency or the Bridge axiom, the set of measures can be drastically
+reduced, making the culling survey short. Applying the culling method provides
+insightful findings on some centrality indices, such as PageRank, Bridging, and
+dissimilarity-based Eigencentrality measures, among others. The proposed
+approach offers a cost-effective solution in terms of labor and time,
+complementing existing methods for measure selection, and providing deeper
+insights into the underlying mechanisms of centrality measures.
+
+
+
+ comment: 12 pages, 2 tables, 1 algorithm, 8 figures. Presentation has been + improved +
+
+
+
+
+ + ♻ ☆ Practical Batch Bayesian Sampling Algorithms for Online Adaptive Traffic + Experimentation + + +
+ Online controlled experiments have emerged as the industry gold standard for
+assessing new web features. As new web algorithms proliferate, experimentation
+platforms face an increasing demand for the velocity of online experiments,
+which encourages adaptive traffic testing methods to speed up identifying the
+best variant by efficiently allocating traffic. This paper proposes four
+Bayesian batch bandit algorithms (NB-TS, WB-TS, NB-TTTS, WB-TTTS) for eBay's
+experimentation platform, using summary batch statistics of a goal metric
+without incurring new engineering technical debt. The novel WB-TTTS, in
+particular, demonstrates itself as an efficient, trustworthy and robust
+alternative to fixed-horizon A/B testing. Another novel contribution is to
+bring the trustworthiness of best arm identification algorithms into the
+evaluation criteria and highlight the existence of severe false positive
+inflation with equivalent best arms. To gain the trust of experimenters, the
+experimentation platform must consider both efficiency and trustworthiness;
+however, to the best of the authors' knowledge, trustworthiness as an
+important topic is rarely discussed in the literature of either best arm
+identification or multi-armed bandits. This paper shows that Bayesian bandits
+without neutral posterior reshaping, particularly naive Thompson sampling
+(NB-TS), are untrustworthy because they can always identify an arm as best
+from equivalent best arms. To restore trustworthiness, a novel finding
+uncovers connections between the convergence distribution of posterior optimal
+probabilities of equivalent best arms and neutral posterior reshaping, which
+controls false positives. Lastly, this paper presents lessons learned from
+eBay's experience, as well as evaluations of the four algorithms. We hope our
+work is useful to other industrial practitioners and inspires academic
+researchers interested in the trustworthiness of adaptive traffic
+experimentation.
+
+
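+ A minimal sketch in the spirit of the NB-TS baseline above, driven purely by
+per-batch summary statistics with Beta-Bernoulli posteriors; the neutral
+posterior reshaping the paper argues is needed for trustworthiness is
+deliberately omitted, and all implementation details are assumptions:
+
+import numpy as np
+
+class BatchThompson:
+    # Beta-Bernoulli batch Thompson sampling from summary batch statistics
+    def __init__(self, n_arms):
+        self.a = np.ones(n_arms)
+        self.b = np.ones(n_arms)
+    def update(self, successes, trials):
+        self.a += np.asarray(successes)
+        self.b += np.asarray(trials) - np.asarray(successes)
+    def allocation(self, n_draws=10000, seed=0):
+        # fraction of posterior draws in which each arm wins ~ traffic split
+        rng = np.random.default_rng(seed)
+        draws = rng.beta(self.a, self.b, size=(n_draws, len(self.a)))
+        return np.bincount(draws.argmax(axis=1),
+                           minlength=len(self.a)) / n_draws
+
+ts = BatchThompson(3)
+ts.update(successes=[30, 45, 38], trials=[500, 500, 500])
+print(ts.allocation())   # next batch's traffic proportions
+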
+
+
+
+
+ + ♻ ☆ Artificial Intelligence for Automatic Detection and Classification + Disease on the X-Ray Images + + +
+ Detecting and classifying diseases using X-ray images is one of the more
+challenging core tasks in the medical and research world. Due to the recent
+surge of interest in radiological images and AI, early detection of diseases in
+X-ray images has become notably more important for preventing further spread
+and flattening the curve. Innovations in Computer Vision with deep learning
+methods offer great promise for fast and accurate screening and diagnosis from
+chest X-ray images (CXR). This work presents rapid detection of lung diseases
+using the efficient pre-trained deep learning RepVGG algorithm for deep feature
+extraction and classification. We used X-ray images as an example to show the
+model's efficiency. For this task, we classify X-ray images into Covid-19,
+Pneumonia, and Normal classes. We employ ROI-based lung extraction to improve
+detection accuracy, followed by data pre-processing and augmentation, and apply
+Artificial Intelligence technology to automatically highlight the affected
+areas of patients' lungs. Based on the X-ray images, an algorithm was developed
+that classifies them with high accuracy and faster inference thanks to the
+architectural transformation of the model. We compared deep learning
+frameworks' accuracy in disease detection. The study shows the high power of
+deep learning methods for COVID-19 detection utilizing chest X-rays. The
+proposed framework offers better diagnostic accuracy in comparison to popular
+deep learning models, i.e., VGG, ResNet50, InceptionV3, DenseNet, and
+InceptionResnetV2.
+
+
+
+
+
+
+ + ♻ ☆ Local Context-Aware Active Domain Adaptation ICCV 2023 + + +
+ Active Domain Adaptation (ADA) queries the labels of a small number of
+selected target samples to help adapt a model from a source domain to a
+target domain. The local context of queried data is important, especially when
+the domain gap is large. However, this has not been fully explored by existing
+ADA works. In this paper, we propose a Local context-aware ADA framework, named
+LADA, to address this issue. To select informative target samples, we devise a
+novel criterion based on the local inconsistency of model predictions. Since
+the labeling budget is usually small, fine-tuning the model on only queried
+data can be inefficient. We progressively augment labeled target data with the
+confident neighbors in a class-balanced manner. Experiments validate that the
+proposed criterion chooses more informative target samples than existing active
+selection strategies. Furthermore, our full method clearly surpasses recent ADA
+arts on various benchmarks. Code is available at https://github.com/tsun/LADA.
+
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ ReCo: A Dataset for Residential Community Layout Planning + + +
+ Layout planning is centrally important in the field of architecture and urban
+design. Among the various basic units carrying urban functions, residential
+communities play a vital part in supporting human life. Therefore, the layout
+planning of residential communities has always been of concern, and has
+attracted particular attention since the advent of deep learning that
+facilitates the automated layout generation and spatial pattern recognition.
+However, the research community generally suffers from a lack of residential
+community layout benchmarks and high-quality datasets, which hampers the future
+exploration of data-driven methods for residential community layout planning.
+The lack of datasets is largely due to the difficulties of large-scale
+real-world residential data acquisition and long-term expert screening. In
+order to address the issues and advance a benchmark dataset for various
+intelligent spatial design and analysis applications in the development of
+smart city, we introduce the Residential Community Layout Planning (ReCo)
+Dataset, which is the first and largest open-source vector dataset related to
+real-world communities to date. The ReCo Dataset is presented in multiple data
+formats with 37,646 residential community layout plans, covering 598,728
+residential buildings with height information. ReCo can be conveniently
+adapted for residential community layout-related urban design tasks, e.g.,
+generative layout design, morphological pattern recognition and spatial
+evaluation. To validate the utility of ReCo in automated residential community
+layout planning, two Generative Adversarial Network (GAN) based generative
+models are further applied to the dataset. We expect the ReCo Dataset to
+inspire more creative and practical work in intelligent design and beyond. The
+ReCo Dataset is published at: https://www.kaggle.com/fdudsde/reco-dataset.
+
+
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ A Survey of Safety and Trustworthiness of Large Language Models through + the Lens of Verification and Validation + + +
+ Large Language Models (LLMs) have set off a new wave of AI enthusiasm thanks
+to their ability to engage end-users in human-level conversations with detailed
+and articulate answers across many knowledge domains. In response to their fast
+adoption in many industrial applications, this survey concerns their safety and
+trustworthiness. First, we review known vulnerabilities and limitations of
+LLMs, categorising them into inherent issues, attacks, and unintended bugs.
+Then, we consider if and how the Verification and Validation (V&V) techniques,
+which have been widely developed for traditional software and deep learning
+models such as convolutional neural networks as independent processes to check
+the alignment of their implementations against the specifications, can be
+integrated and further extended throughout the lifecycle of the LLMs to provide
+rigorous analysis of the safety and trustworthiness of LLMs and their
+applications. Specifically, we consider four complementary techniques:
+falsification and evaluation, verification, runtime monitoring, and regulations
+and ethical use. In total, 370+ references are considered to support the quick
+understanding of the safety and trustworthiness issues from the perspective of
+V&V. While intensive research has been conducted to identify the safety and
+trustworthiness issues, rigorous yet practical methods are called for to ensure
+the alignment of LLMs with safety and trustworthiness requirements.
+
+
+
+
+
+
+ + ♻ ☆ PMU measurements based short-term voltage stability assessment of power + systems via deep transfer learning + + +
+ Deep learning has emerged as an effective solution for addressing the +challenges of short-term voltage stability assessment (STVSA) in power systems. +However, existing deep learning-based STVSA approaches face limitations in +adapting to topological changes, sample labeling, and handling small datasets. +To overcome these challenges, this paper proposes a novel phasor measurement +unit (PMU) measurements-based STVSA method by using deep transfer learning. The +method leverages the real-time dynamic information captured by PMUs to create +an initial dataset. It employs temporal ensembling for sample labeling and +utilizes least squares generative adversarial networks (LSGAN) for data +augmentation, enabling effective deep learning on small-scale datasets. +Additionally, the method enhances adaptability to topological changes by +exploring connections between different faults. Experimental results on the +IEEE 39-bus test system demonstrate that the proposed method improves model +evaluation accuracy by approximately 20% through transfer learning, exhibiting +strong adaptability to topological changes. Leveraging the self-attention +mechanism of the Transformer model, this approach offers significant advantages +over shallow learning methods and other deep learning-based approaches. + +
+
+ comment: Accepted by IEEE Transactions on Instrumentation & Measurement +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Computation-efficient Deep Learning for Computer Vision: A Survey + + +
+ Over the past decade, deep learning models have exhibited considerable +advancements, reaching or even exceeding human-level performance in a range of +visual perception tasks. This remarkable progress has sparked interest in +applying deep networks to real-world applications, such as autonomous vehicles, +mobile devices, robotics, and edge computing. However, the challenge remains +that state-of-the-art models usually demand significant computational +resources, leading to impractical power consumption, latency, or carbon +emissions in real-world scenarios. This trade-off between effectiveness and +efficiency has catalyzed the emergence of a new research focus: computationally +efficient deep learning, which strives to achieve satisfactory performance +while minimizing the computational cost during inference. This review offers an +extensive analysis of this rapidly evolving field by examining four key areas: +1) the development of static or dynamic light-weighted backbone models for the +efficient extraction of discriminative deep representations; 2) the specialized +network architectures or algorithms tailored for specific computer vision +tasks; 3) the techniques employed for compressing deep learning models; and 4) +the strategies for deploying efficient deep networks on hardware platforms. +Additionally, we provide a systematic discussion on the critical challenges +faced in this domain, such as network architecture design, training schemes, +practical efficiency, and more realistic model compression approaches, as well +as potential future research directions. + +
+
+
+
+
+ + ♻ ☆ VATP360: Viewport Adaptive 360-Degree Video Streaming based on Tile + Priority + + +
+ 360-degree video is becoming increasingly popular among users. Under current
+network bandwidth constraints, serving high-resolution 360-degree video to
+users is quite difficult. Most of the work has been devoted to the prediction
+of user viewports or tile-based adaptive algorithms. However, it is difficult
+to predict user viewports more accurately using only information such as the
+user's historical viewports or video saliency maps. In this paper, we propose a
+viewport adaptive 360-degree video streaming method based on tile priority
+(VATP360), which tries to balance performance and overhead. The
+proposed VATP360 consists of three main modules: viewport prediction, tile
+priority classification and bitrate allocation. In the viewport prediction
+module, object motion trajectory and the predicted user's region-of-interest
+(ROI) are used to achieve accurate prediction of the user's future viewport.
+Then, the predicted viewport, along with the object motion trajectory, is fed
+into the proposed tile priority classification algorithm to assign different
+priorities to tiles, which reduces the computational complexity of the
+bitrate allocation module. Finally, in the bitrate allocation stage, we
+adaptively assign bitrates to tiles of different priority by reinforcement
+learning. Experimental results on publicly available datasets have demonstrated
+the effectiveness of the proposed method.
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 22 + +
+
+
+ + ☆ Translate Meanings, Not Just Words: IdiomKB's Role in Optimizing + Idiomatic Translation with Language Models + + +
+ To translate well, machine translation (MT) systems and general-purpose
+language models (LMs) need a deep understanding of both source and target
+languages and cultures. Therefore, idioms, with their non-compositional nature,
+pose particular challenges for Transformer-based systems, as literal
+translations often miss the intended meaning. Traditional methods, which
+replace idioms using existing knowledge bases (KBs), often lack scale and
+context awareness. Addressing these challenges, our approach prioritizes
+context awareness and scalability, allowing for offline storage of idioms in a
+manageable KB size. This ensures efficient serving with smaller models and
+provides a more comprehensive understanding of idiomatic expressions. To this
+end, we introduce a multilingual idiom KB (IdiomKB) developed using large LMs.
+This KB facilitates better translation by smaller models, such as
+BLOOMZ (7.1B), Alpaca (7B), and InstructGPT (6.7B), by retrieving idioms'
+figurative meanings. We present a novel, GPT-4-powered metric for human-aligned
+evaluation, demonstrating that IdiomKB considerably boosts model performance.
+Human evaluations further validate our KB's quality.
+
+
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Improving Knowledge Distillation for BERT Models: Loss Functions, + Mapping Methods, and Weight Tuning + + +
+ The use of large transformer-based models such as BERT, GPT, and T5 has led
+to significant advancements in natural language processing. However, these
+models are computationally expensive, necessitating model compression
+techniques that reduce their size and complexity while maintaining accuracy.
+This project investigates and applies knowledge distillation for BERT model
+compression, specifically focusing on the TinyBERT student model. We explore
+various techniques to improve knowledge distillation, including
+experimentation with loss functions, transformer layer mapping methods, and
+the tuning of attention and representation loss weights, and we evaluate our
+proposed techniques on a selection of downstream tasks from the GLUE
+benchmark. The goal of this work is to improve the efficiency and
+effectiveness of knowledge distillation, enabling the development of more
+efficient and accurate models for a range of natural language processing
+tasks.
+
+
+
+
+
+
+ + ☆ Exploring Large Language Models for Knowledge Graph Completion + + +
+ Knowledge graphs play a vital role in numerous artificial intelligence tasks, +yet they frequently face the issue of incompleteness. In this study, we explore +utilizing Large Language Models (LLM) for knowledge graph completion. We +consider triples in knowledge graphs as text sequences and introduce an +innovative framework called Knowledge Graph LLM (KG-LLM) to model these +triples. Our technique employs entity and relation descriptions of a triple as +prompts and utilizes the response for predictions. Experiments on various +benchmark knowledge graphs demonstrate that our method attains state-of-the-art +performance in tasks such as triple classification and relation prediction. We +also find that fine-tuning relatively smaller models (e.g., LLaMA-7B, +ChatGLM-6B) outperforms recent ChatGPT and GPT-4. + +
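+ The abstract states that entity and relation descriptions serve as prompts;
+the exact template is not given, so the helper below is an illustrative guess
+(all names and wording are assumptions, not the paper's template):
+
+def triple_classification_prompt(head, relation, tail, head_desc, tail_desc):
+    # illustrative KG-LLM-style prompt for triple classification
+    return (
+        f"Here is a candidate triple from a knowledge graph.\n"
+        f"Head entity: {head} ({head_desc})\n"
+        f"Relation: {relation}\n"
+        f"Tail entity: {tail} ({tail_desc})\n"
+        f"Is this triple correct? Answer yes or no."
+    )
+
+print(triple_classification_prompt(
+    "Marie Curie", "award received", "Nobel Prize in Physics",
+    "Polish-French physicist and chemist", "annual prize awarded for physics"))
+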
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ A Wide Evaluation of ChatGPT on Affective Computing Tasks + + +
+ With the rise of foundation models, a new artificial intelligence paradigm
+has emerged: simply using general-purpose foundation models with prompting
+to solve problems instead of training a separate machine learning model for
+each problem. Such models have been shown to have emergent properties of
+solving problems that they were not initially trained on. Studies of the
+effectiveness of such models are still quite limited. In this work, we widely
+study the capabilities of the ChatGPT models, namely GPT-4 and GPT-3.5, on 13
+affective computing problems, namely aspect extraction, aspect polarity
+classification, opinion extraction, sentiment analysis, sentiment intensity
+ranking, emotion intensity ranking, suicide tendency detection, toxicity
+detection, well-being assessment, engagement measurement, personality
+assessment, sarcasm detection, and subjectivity detection. We introduce a
+framework to evaluate the ChatGPT models on regression-based problems, such as
+intensity ranking problems, by modelling them as pairwise ranking
+classification. We compare ChatGPT against more traditional NLP methods, such
+as end-to-end recurrent neural networks and transformers. The results
+demonstrate the emergent abilities of the ChatGPT models on a wide range of
+affective computing problems, where GPT-3.5 and especially GPT-4 have shown
+strong performance on many problems, particularly the ones related to
+sentiment, emotions, or toxicity. The ChatGPT models fell short for problems
+with implicit signals, such as engagement measurement and subjectivity
+detection.
+
+
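+ The pairwise reduction described above can be sketched as follows; the judge
+function is a toy stand-in for the actual ChatGPT prompt, and all data below
+are placeholders:
+
+import itertools
+import random
+
+def pairwise_ranking_accuracy(items, gold, judge, n_pairs=100, seed=0):
+    # sample item pairs with distinct gold intensities, ask the model which
+    # is more intense, and score its choices against the gold ordering
+    rng = random.Random(seed)
+    pairs = [p for p in itertools.combinations(range(len(items)), 2)
+             if gold[p[0]] != gold[p[1]]]
+    pairs = rng.sample(pairs, min(n_pairs, len(pairs)))
+    hits = sum(judge(items[i], items[j]) == (gold[i] > gold[j])
+               for i, j in pairs)
+    return hits / len(pairs)
+
+def judge(a, b):
+    # toy stand-in for the LLM call ("Which text is more intense, A or B?")
+    return a.count("!") > b.count("!")
+
+items = ["fine, I guess", "I am quite upset!", "ABSOLUTELY FURIOUS!!!"]
+gold = [0.1, 0.5, 0.9]
+print(pairwise_ranking_accuracy(items, gold, judge, n_pairs=3))
+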
+
+ comment: 8 pages with references, 2 tables +
+
+
+
+
+ + ☆ LMSanitator: Defending Prompt-Tuning Against Task-Agnostic Backdoors NDSS + + +
+ Prompt-tuning has emerged as an attractive paradigm for deploying large-scale
+language models due to its strong downstream task performance and efficient
+multitask serving ability. Despite its wide adoption, we empirically show that
+prompt-tuning is vulnerable to downstream task-agnostic backdoors, which reside
+in the pretrained models and can affect arbitrary downstream tasks. The
+state-of-the-art backdoor detection approaches cannot defend against
+task-agnostic backdoors since they hardly converge in reversing the backdoor
+triggers. To address this issue, we propose LMSanitator, a novel approach for
+detecting and removing task-agnostic backdoors on Transformer models. Instead
+of directly inverting the triggers, LMSanitator aims to invert the predefined
+attack vectors (pretrained models' output when the input is embedded with
+triggers) of the task-agnostic backdoors, which achieves much better
+convergence performance and backdoor detection accuracy. LMSanitator further
+leverages prompt-tuning's property of freezing the pretrained model to perform
+accurate and fast output monitoring and input purging during the inference
+phase. Extensive experiments on multiple language models and NLP tasks
+illustrate the effectiveness of LMSanitator. For instance, LMSanitator achieves
+92.8% backdoor detection accuracy on 960 models and decreases the attack
+success rate to less than 1% in most scenarios.
+
+
+
+ comment: To Appear in the Network and Distributed System Security (NDSS) + Symposium 2024, 26 February - 1 March 2024, San Diego, CA, USA +
+
+
+
+
+ + ☆ Solving Math Word Problem with Problem Type Classification NLPCC2023 + + +
+ Math word problems (MWPs) require analyzing text descriptions and generating
+mathematical equations to derive solutions. Existing works focus on solving
+MWPs with two types of solvers: tree-based solvers and large language model
+(LLM) solvers. However, these approaches always solve MWPs with a single
+solver, which brings the following problems: (1) a single type of solver is
+hard-pressed to solve all types of MWPs well; (2) a single solver will result
+in poor performance due to over-fitting. To address these challenges, this
+paper utilizes multiple ensemble approaches to improve MWP-solving ability.
+Firstly, we propose a problem type classifier that combines the strengths of
+the tree-based solver and the LLM solver. This ensemble approach leverages
+their respective advantages and broadens the range of MWPs that can be solved.
+Furthermore, we also apply ensemble techniques to both the tree-based solver
+and the LLM solver to improve their performance. For the tree-based solver, we
+propose an ensemble learning framework based on ten-fold cross-validation and
+a voting mechanism. In the LLM solver, we adopt the self-consistency (SC)
+method to improve answer selection. Experimental results demonstrate the
+effectiveness of these ensemble approaches in enhancing MWP-solving ability.
+The comprehensive evaluation showcases improved performance, validating the
+advantages of our proposed approach. Our code is available at this url:
+https://github.com/zhouzihao501/NLPCC2023-Shared-Task3-ChineseMWP.
+
+
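+ The self-consistency step above reduces to majority voting over sampled
+solutions; a minimal sketch (sampling the reasoning chains from the LLM is
+assumed to happen elsewhere):
+
+from collections import Counter
+
+def self_consistency(final_answers):
+    # majority vote over the final answers of independently sampled chains;
+    # the same vote also serves the ten-fold tree-based-solver ensemble
+    return Counter(final_answers).most_common(1)[0][0]
+
+print(self_consistency([42, 42, 40, 42, 38]))   # -> 42
+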
+
+ comment: Accepted by NLPCC2023
+
+
+
+
+
+ + ☆ Planning with Logical Graph-based Language Model for Instruction + Generation + + +
+ Despite the superior performance of large language models at generating
+natural language texts, it is hard to generate texts with correct logic
+according to a given task, due to the difficulty neural models have in
+capturing implied rules from free-form texts. In this paper, we propose a
+novel graph-based language model, Logical-GLM, to infuse logic into language
+models for more valid text generation and interpretability. Specifically, we
+first capture information from natural language instructions and construct
+logical Bayes graphs that generally describe domains. Next, we generate
+logical skeletons to guide language model training, infusing domain knowledge
+into language models. Finally, we alternately optimize the searching policy of
+graphs and language models until convergence. The experimental results show
+that Logical-GLM is both effective and efficient compared with traditional
+language models, despite using smaller-scale training data and fewer
+parameters. Our approach can generate instructional texts with more correct
+logic owing to the internalized domain knowledge. Moreover, the usage of
+logical graphs reflects the inner mechanism of the language models, which
+improves the interpretability of black-box models.
+
+
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ EditSum: A Retrieve-and-Edit Framework for Source Code Summarization + + +
+ Existing studies show that code summaries help developers understand and
+maintain source code. Unfortunately, these summaries are often missing or
+outdated in software projects. Code summarization aims to generate natural
+language descriptions automatically for source code. Code summaries are highly
+structured and have repetitive patterns. Besides the patternized words, a code
+summary also contains important keywords, which are the key to reflecting the
+functionality of the code. However, the state-of-the-art approaches perform
+poorly on predicting the keywords, which leads to the generated summaries
+suffering a loss in informativeness. To alleviate this problem, this paper
+proposes a novel retrieve-and-edit approach named EditSum for code
+summarization. Specifically, EditSum first retrieves a similar code snippet
+from a pre-defined corpus and treats its summary as a prototype summary to
+learn the pattern. Then, EditSum edits the prototype automatically to combine
+the pattern in the prototype with the semantic information of the input code.
+Our motivation is that the retrieved prototype provides a good starting point
+for post-generation because the summaries of similar code snippets often have
+the same pattern. The post-editing process further reuses the patternized
+words in the prototype and generates keywords based on the semantic
+information of the input code. We conduct experiments on a large-scale Java
+corpus and the experimental results demonstrate that EditSum outperforms the
+state-of-the-art approaches by a substantial margin. The human evaluation also
+proves that the summaries generated by EditSum are more informative and
+useful. We also verify that EditSum performs well on predicting the
+patternized words and keywords.
+
+
+
+ comment: Accepted by the 36th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2021) +
+
+
+
+
+ + ☆ Adversarial Fine-Tuning of Language Models: An Iterative Optimisation + Approach for the Generation and Detection of Problematic Content + + +
+ In this paper, we tackle the emerging challenge of unintended harmful content
+generation in Large Language Models (LLMs) with a novel dual-stage optimisation
+technique using adversarial fine-tuning. Our two-pronged approach employs an
+adversarial model, fine-tuned to generate potentially harmful prompts, and a
+judge model, iteratively optimised to discern these prompts. In this
+adversarial cycle, the two models seek to outperform each other in the
+prompting phase, generating a dataset of rich examples which are then used for
+fine-tuning. This iterative application of prompting and fine-tuning allows
+continuous refinement and improved performance. The performance of our approach
+is evaluated through classification accuracy on a dataset consisting of
+problematic prompts not detected by GPT-4, as well as a selection of
+contentious but unproblematic prompts. We show a considerable increase in the
+classification accuracy of the judge model on this challenging dataset as it
+undergoes the optimisation process. Furthermore, we show that a rudimentary
+model, ada, can achieve 13% higher accuracy on the hold-out test set
+than GPT-4 after only a few rounds of this process, and that this fine-tuning
+improves performance in parallel tasks such as toxic comment identification.
+
+
+
+
+
+
+ + ☆ How Can Context Help? Exploring Joint Retrieval of Passage and + Personalized Context + + +
+ The integration of external personalized context information into +document-grounded conversational systems has significant potential business +value, but has not been well-studied. Motivated by the concept of personalized +context-aware document-grounded conversational systems, we introduce the task +of context-aware passage retrieval. We also construct a dataset specifically +curated for this purpose. We describe multiple baseline systems to address this +task, and propose a novel approach, Personalized Context-Aware Search (PCAS), +that effectively harnesses contextual information during passage retrieval. +Experimental evaluations conducted on multiple popular dense retrieval systems +demonstrate that our proposed approach not only outperforms the baselines in +retrieving the most relevant passage but also excels at identifying the +pertinent context among all the available contexts. We envision that our +contributions will serve as a catalyst for inspiring future research endeavors +in this promising direction. + +
+
+
+
+
+ + ☆ ZC3: Zero-Shot Cross-Language Code Clone Detection + + +
+ Developers introduce code clones to improve programming productivity. Many
+existing studies have achieved impressive performance in monolingual code clone
+detection. However, during software development, more and more developers write
+semantically equivalent programs in different languages to support different
+platforms and help developers translate projects from one language to another.
+Considering that collecting cross-language parallel data, especially for
+low-resource languages, is expensive and time-consuming, how to design an
+effective cross-language model that does not rely on any parallel data is a
+significant problem. In this paper, we propose a novel method named ZC3 for
+Zero-shot Cross-language Code Clone detection. ZC3 designs the contrastive
+snippet prediction to form an isomorphic representation space among different
+programming languages. Based on this, ZC3 exploits domain-aware learning and
+cycle consistency learning to further constrain the model to generate
+representations that are aligned among different languages while remaining
+discriminative for different types of clones. To evaluate our approach, we
+conduct extensive experiments on four representative cross-language clone
+detection datasets. Experimental results show that ZC3 outperforms the
+state-of-the-art baselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP
+score, respectively. We further investigate the representational distribution
+of different languages and discuss the effectiveness of our method.
+
+
+
+ comment: Accepted by the 38th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2023) +
+
+
+
+
+ + ☆ On Philomatics and Psychomatics for Combining Philosophy and Psychology + with Mathematics + + +
+ We propose the concepts of philomatics and psychomatics as hybrid
+combinations of philosophy and psychology with mathematics. We explain four
+motivations for this combination: fulfilling the desire of analytical
+philosophy, proposing a science of philosophy, justifying mathematical
+algorithms by philosophy, and abstraction in both philosophy and mathematics.
+We enumerate various examples of philomatics and psychomatics, some of which
+are explained in more depth. The first example is the analysis of the relation
+between the context principle, semantic holism, and the usage theory of
+meaning with the attention mechanism in mathematics. Another example concerns
+the relations of Plato's theory of forms in philosophy with the holographic
+principle in string theory, object-oriented programming, and machine learning.
+Finally, the relation between Wittgenstein's family resemblance and clustering
+in mathematics is explained. This paper opens the door to research on
+combining philosophy and psychology with mathematics.
+
+
+
+
+
+
+ + ☆ A Computational Evaluation Framework for Singable Lyric Translation + + +
+ Lyric translation plays a pivotal role in amplifying the global resonance of +music, bridging cultural divides, and fostering universal connections. +Translating lyrics, unlike conventional translation tasks, requires a delicate +balance between singability and semantics. In this paper, we present a +computational framework for the quantitative evaluation of singable lyric +translation, which seamlessly integrates musical, linguistic, and cultural +dimensions of lyrics. Our comprehensive framework consists of four metrics that +measure syllable count distance, phoneme repetition similarity, musical +structure distance, and semantic similarity. To substantiate the efficacy of +our framework, we collected a singable lyrics dataset, which precisely aligns +English, Japanese, and Korean lyrics on a line-by-line and section-by-section +basis, and conducted a comparative analysis between singable and non-singable +lyrics. Our multidisciplinary approach provides insights into the key +components that underlie the art of lyric translation and establishes a solid +groundwork for the future of computational lyric translation assessment. + +
+
+ comment: ISMIR 2023 +
+
+
+
+
+ + ♻ ☆ External Reasoning: Towards Multi-Large-Language-Models Interchangeable + Assistance with Human Feedback + + +
+ Memory is identified as a crucial human faculty that allows for the retention
+of visual and linguistic information within the hippocampus and neurons in the
+brain, which can subsequently be retrieved to address real-world challenges
+that arise through a lifetime of learning. The resolution of complex AI tasks
+through the application of acquired knowledge represents a stride toward the
+realization of artificial general intelligence. However, despite the prevalence
+of Large Language Models (LLMs) like GPT-3.5 and GPT-4, which have
+displayed remarkable capabilities in language comprehension, generation,
+interaction, and reasoning, they are inhibited by constraints on context length
+that preclude the processing of extensive, continually evolving knowledge
+bases. This paper proposes that LLMs could be augmented through the selective
+integration of knowledge from external repositories, and in doing so,
+introduces a novel methodology for External Reasoning, exemplified by ChatPDF.
+Central to this approach is the establishment of a tiered policy for External
+Reasoning based on Multiple LLM Interchange Assistance, where the level of
+support rendered is modulated across entry, intermediate, and advanced tiers
+based on the complexity of the query, with adjustments made in response to
+human feedback. A comprehensive evaluation of this methodology is conducted
+using multiple LLMs, and the results indicate state-of-the-art performance,
+surpassing existing solutions including ChatPDF.com. Moreover, the paper
+emphasizes that this approach is more efficient compared to the direct
+processing of full text by LLMs. The source code is publicly available at:
+https://github.com/AkideLiu/ANLP.
+
+
+
+ comment: technical report, add code link. arXiv admin note: text overlap with + arXiv:2305.11206 by other authors +
+
+
+
+
+ + ♻ ☆ Exploring Linguistic Style Matching in Online Communities: The Role of + Social Context and Conversation Dynamics + + +
+ Linguistic style matching (LSM) in conversations can be reflective of several
+aspects of social influence such as power or persuasion. However, how LSM
+relates to the outcomes of online communication on platforms such as Reddit
+remains an open question. In this study, we analyze a large corpus of two-party
+conversation threads on Reddit, in which we identify all occurrences of LSM
+using two types of style: the use of function words and formality. Using this
+framework, we examine how levels of LSM differ in conversations depending on
+several social factors within Reddit: post and subreddit features, conversation
+depth, user tenure, and the controversiality of a comment. Finally, we measure
+the change in LSM following loss of status after community banning. Our
+findings reveal the interplay of LSM in Reddit conversations with several
+community metrics, suggesting the importance of understanding conversation
+engagement when studying community dynamics.
+
+
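+ For readers unfamiliar with how LSM is typically quantified, the sketch below
+computes a standard per-category matching score, 1 - |a - b| / (a + b + eps),
+averaged over function-word categories, as in prior LSM work. Whether this
+study uses exactly this variant is an assumption, and the category rates shown
+are toy values.
+
+# Standard-style LSM computation over function-word usage rates (toy values).
+def lsm_score(rates_a: dict, rates_b: dict) -> float:
+    """Average of 1 - |a-b|/(a+b+eps) across shared categories."""
+    eps = 1e-4
+    cats = rates_a.keys() & rates_b.keys()
+    return sum(1 - abs(rates_a[c] - rates_b[c]) / (rates_a[c] + rates_b[c] + eps)
+               for c in cats) / len(cats)
+
+# Percentage of tokens per function-word category in each speaker's comments.
+a = {"pronouns": 9.2, "articles": 6.1, "prepositions": 12.4}
+b = {"pronouns": 7.8, "articles": 6.5, "prepositions": 10.9}
+print(round(lsm_score(a, b), 3))
+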
+
+ comment: Equal contributions from authors 1-9 (AA, HC, JY, KA, JP, AS, LD, MC, + BL) +
+
+
+
+
+ + ♻ ☆ Emoji Prediction in Tweets using BERT + + +
+ In recent years, the use of emojis in social media has increased
+dramatically, making them an important element in understanding online
+communication. However, predicting the most appropriate emoji for a given text
+is a challenging task due to their ambiguous nature. In this study, we propose
+a transformer-based approach for emoji prediction using BERT, a widely used
+pre-trained language model. We fine-tune BERT on a large corpus of tweets
+containing both text and emojis to predict the most appropriate emoji for a
+given text. Our experimental results demonstrate that our approach outperforms
+several state-of-the-art models, predicting emojis with an accuracy of over 75
+percent. This work has potential applications in natural language processing,
+sentiment analysis, and social media marketing.
+
+
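+ The fine-tuning recipe described above follows the standard sequence
+classification setup; a minimal sketch with the HuggingFace transformers API
+is shown below. The emoji label set, the toy tweets, and the hyperparameters
+are placeholders, not the paper's configuration.
+
+# Minimal BERT fine-tuning step for emoji classification (placeholder data).
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+EMOJIS = ["😂", "❤️", "🔥"]  # placeholder label set
+tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModelForSequenceClassification.from_pretrained(
+    "bert-base-uncased", num_labels=len(EMOJIS))
+
+texts = ["that game was unreal", "miss you so much"]  # toy tweets
+labels = torch.tensor([2, 1])                         # gold emoji indices
+batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
+
+optim = torch.optim.AdamW(model.parameters(), lr=2e-5)
+model.train()
+out = model(**batch, labels=labels)  # cross-entropy loss over emoji classes
+out.loss.backward()
+optim.step()
+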
+
+ comment: This paper is focused on predicting emojis corresponding to tweets + using BERT +
+
+
+
+
+ + ♻ ☆ AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual + Similarity Using Contrastive Learning and Structured Knowledge + + +
+ Generic sentence embeddings provide a coarse-grained approximation of +semantic textual similarity but ignore specific aspects that make texts +similar. Conversely, aspect-based sentence embeddings provide similarities +between texts based on certain predefined aspects. Thus, similarity predictions +of texts are more targeted to specific requirements and more easily +explainable. In this paper, we present AspectCSE, an approach for aspect-based +contrastive learning of sentence embeddings. Results indicate that AspectCSE +achieves an average improvement of 3.97% on information retrieval tasks across +multiple aspects compared to the previous best results. We also propose using +Wikidata knowledge graph properties to train models of multi-aspect sentence +embeddings in which multiple specific aspects are simultaneously considered +during similarity predictions. We demonstrate that multi-aspect embeddings +outperform single-aspect embeddings on aspect-specific information retrieval +tasks. Finally, we examine the aspect-based sentence embedding space and +demonstrate that embeddings of semantically similar aspect labels are often +close, even without explicit similarity training between different aspect +labels. + +
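+ The contrastive objective underlying this family of sentence-embedding
+methods is typically an in-batch InfoNCE loss; a generic sketch is given
+below. AspectCSE's aspect-conditioned choice of positives and its temperature
+setting are not reproduced here, so treat this as an assumption-level
+illustration rather than the paper's exact objective.
+
+# Generic in-batch InfoNCE loss for contrastive sentence-embedding training.
+import torch
+import torch.nn.functional as F
+
+def info_nce(anchors, positives, temperature=0.05):
+    """anchors, positives: [batch, dim]; row i of positives matches row i."""
+    a = F.normalize(anchors, dim=-1)
+    p = F.normalize(positives, dim=-1)
+    logits = a @ p.T / temperature        # cosine similarity matrix
+    targets = torch.arange(a.size(0))     # other rows act as negatives
+    return F.cross_entropy(logits, targets)
+
+loss = info_nce(torch.randn(8, 768), torch.randn(8, 768))
+print(loss.item())
+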
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Multi-View Reasoning: Consistent Contrastive Learning for Math Word + Problem + + +
+ Math word problem solving requires both precise relational reasoning about
+quantities in the text and reliable generation of diverse equations. Current
+sequence-to-tree or relation extraction methods regard the problem from a
+single fixed view, struggling to simultaneously handle complex semantics and
+diverse equations. However, human solving naturally involves two consistent
+reasoning views: top-down and bottom-up, just as math equations can be
+expressed in multiple equivalent forms: pre-order and post-order. We propose
+multi-view consistent contrastive learning for a more complete
+semantics-to-equation mapping. The entire process is decoupled into two
+independent but consistent views: top-down decomposition and bottom-up
+construction, and the two reasoning views are aligned in multi-granularity for
+consistency, enhancing global generation and precise reasoning. Experiments on
+multiple datasets across two languages show our approach significantly
+outperforms existing baselines, especially on complex problems. We also show
+that, after consistent alignment, the multi-view model can absorb the merits of
+both views and generate more diverse results consistent with mathematical laws.
+
+
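+ The two equivalent equation forms mentioned above are simply the pre-order
+(top-down) and post-order (bottom-up) traversals of the same expression tree,
+as the toy example below illustrates; the paper's decoders are of course far
+richer than this.
+
+# One expression tree, two equivalent serializations.
+class Node:
+    def __init__(self, val, left=None, right=None):
+        self.val, self.left, self.right = val, left, right
+
+def pre_order(n):
+    return [] if n is None else [n.val] + pre_order(n.left) + pre_order(n.right)
+
+def post_order(n):
+    return [] if n is None else post_order(n.left) + post_order(n.right) + [n.val]
+
+tree = Node("*", Node("+", Node("3"), Node("5")), Node("2"))  # (3 + 5) * 2
+print(pre_order(tree))   # ['*', '+', '3', '5', '2']
+print(post_order(tree))  # ['3', '5', '+', '2', '*']
+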
+
+ comment: 14 pages, 5 figures, 3 appendix figures +
+
+
+
+
+ + ♻ ☆ PGTask: Introducing the Task of Profile Generation from Dialogues SIGDIAL 2023 + + +
+ Recent approaches have attempted to personalize dialogue systems by
+incorporating profile information into models. However, this knowledge is
+scarce and difficult to obtain, which makes the extraction/generation of
+profile information from dialogues a fundamental asset. To overcome this
+limitation, we introduce the Profile Generation Task (PGTask). We contribute a
+new dataset for this problem, comprising profile sentences aligned with related
+utterances, extracted from a corpus of dialogues. Furthermore, using
+state-of-the-art methods, we provide a benchmark for profile generation on this
+novel dataset. Our experiments disclose the challenges of profile generation,
+and we hope that this work introduces a new research direction.
+
+
+
+ comment: Accepted at SIGDIAL 2023, 4 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Domain Specialization as the Key to Make Large Language Models + Disruptive: A Comprehensive Survey + + +
+ Large language models (LLMs) have significantly advanced the field of natural
+language processing (NLP), providing a highly useful, task-agnostic foundation
+for a wide range of applications. However, directly applying LLMs to solve
+sophisticated problems in specific domains meets many hurdles, caused by the
+heterogeneity of domain data, the sophistication of domain knowledge, the
+uniqueness of domain objectives, and the diversity of the constraints (e.g.,
+various social norms, cultural conformity, religious beliefs, and ethical
+standards in the domain applications). Domain specialization techniques are key
+to making large language models disruptive in many applications. Specifically,
+to overcome these hurdles, there has been a notable increase in research and
+practices conducted in recent years on the domain specialization of LLMs. This
+emerging field of study, with its substantial potential for impact,
+necessitates a comprehensive and systematic review to better summarize and
+guide ongoing work in this area. In this article, we present a comprehensive
+survey on domain specialization techniques for large language models, an
+emerging direction critical for large language model applications. First, we
+propose a systematic taxonomy that categorizes the LLM domain-specialization
+techniques based on the accessibility to LLMs and summarizes the framework for
+all the subcategories as well as their relations and differences to each other.
+Second, we present an extensive taxonomy of critical application domains that
+can benefit dramatically from specialized LLMs, discussing their practical
+significance and open challenges. Last, we offer our insights into the current
+research status and future trends in this area.
+
+
+
+
+
+
+ + ♻ ☆ A Survey on Knowledge Graphs for Healthcare: Resources, Applications, + and Promises + + +
+ Healthcare knowledge graphs (HKGs) have emerged as a promising tool for +organizing medical knowledge in a structured and interpretable way, which +provides a comprehensive view of medical concepts and their relationships. +However, challenges such as data heterogeneity and limited coverage remain, +emphasizing the need for further research in the field of HKGs. This survey +paper serves as the first comprehensive overview of HKGs. We summarize the +pipeline and key techniques for HKG construction (i.e., from scratch and +through integration), as well as the common utilization approaches (i.e., +model-free and model-based). To provide researchers with valuable resources, we +organize existing HKGs (The resource is available at +https://github.com/lujiaying/Awesome-HealthCare-KnowledgeBase) based on the +data types they capture and application domains, supplemented with pertinent +statistical information. In the application section, we delve into the +transformative impact of HKGs across various healthcare domains, spanning from +fine-grained basic science research to high-level clinical decision support. +Lastly, we shed light on the opportunities for creating comprehensive and +accurate HKGs in the era of large language models, presenting the potential to +revolutionize healthcare delivery and enhance the interpretability and +reliability of clinical prediction. + +
+
+
+
+
+ + ♻ ☆ Language Model Behavior: A Comprehensive Survey + + +
+ Transformer language models have received widespread public attention, yet
+their generated text is often surprising even to NLP researchers. In this
+survey, we discuss over 250 recent studies of English language model behavior
+before task-specific fine-tuning. Language models possess basic capabilities in
+syntax, semantics, pragmatics, world knowledge, and reasoning, but these
+capabilities are sensitive to specific inputs and surface features. Despite
+dramatic increases in generated text quality as models scale to hundreds of
+billions of parameters, the models are still prone to non-factual responses,
+commonsense errors, memorized text, and social biases. Many of these weaknesses
+can be framed as over-generalizations or under-generalizations of learned
+patterns in text. We synthesize recent results to highlight what is currently
+known about large language model capabilities, thus providing a resource for
+applied work and for research in adjacent fields that use language models.
+
+
+
+ comment: 32 pages, accepted to Computational Linguistics +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ Video and Audio are Images: A Cross-Modal Mixer for Original Data on + Video-Audio Retrieval + + +
+ Cross-modal retrieval has become popular in recent years, particularly with
+the rise of multimedia. Generally, the information from each modality exhibits
+distinct representations and semantic information, so features encoded with
+dual-tower architectures tend to lie in separate latent spaces, which makes it
+difficult to establish semantic relationships between modalities and results in
+poor retrieval performance. To address this issue, we propose a novel framework
+for cross-modal retrieval which consists of a cross-modal mixer, a masked
+autoencoder for pre-training, and a cross-modal retriever for downstream tasks.
+Specifically, we first adopt the cross-modal mixer and mask modeling to fuse
+the original modalities and eliminate redundancy. Then, an encoder-decoder
+architecture is applied to achieve a fuse-then-separate task in the
+pre-training phase. We feed masked fused representations into the encoder and
+reconstruct them with the decoder, ultimately separating the original data of
+the two modalities. In downstream tasks, we use the pre-trained encoder to
+build the cross-modal retrieval method. Extensive experiments on 2 real-world
+datasets show that our approach outperforms previous state-of-the-art methods
+in video-audio matching tasks, improving retrieval accuracy by up to 2 times.
+Furthermore, we demonstrate the generality of our model by transferring it to
+other downstream tasks.
+
+
+
+
+
+
+ + ☆ Central Similarity Multi-View Hashing for Multimedia Retrieval APWeb + + +
+ Hash representation learning of multi-view heterogeneous data is the key to
+improving the accuracy of multimedia retrieval. However, existing methods rely
+only on local similarity to train their models, ignoring global similarity, and
+fall short of deeply fusing the multi-view features, resulting in poor
+retrieval accuracy. Furthermore, most recent works fuse the multi-view features
+via a weighted sum or concatenation. We contend that these fusion methods are
+insufficient for capturing the interaction between various views. We present a
+novel Central Similarity Multi-View Hashing (CSMVH) method to address the
+mentioned problems. Central similarity learning is used for solving the local
+similarity problem, as it can exploit the global similarity between the hash
+center and samples. We present copious empirical data demonstrating the
+superiority of gate-based fusion over conventional approaches. On the MS COCO
+and NUS-WIDE datasets, the proposed CSMVH performs better than the
+state-of-the-art methods by a large margin (up to 11.41% mean Average Precision
+(mAP) improvement).
+
+
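+ To give a feel for central similarity learning, the sketch below pulls
+continuous hash codes toward their class's fixed hash center with a binary
+cross-entropy term. The random centers, the tanh relaxation, and the loss
+form are illustrative assumptions; CSMVH's multi-view fusion is not shown.
+
+# Toy central similarity loss: codes are attracted to per-class hash centers.
+import torch
+import torch.nn.functional as F
+
+bits, n_classes = 16, 4
+centers = torch.sign(torch.randn(n_classes, bits))  # {-1,+1} hash centers
+
+def central_similarity_loss(codes, labels):
+    """codes: [batch, bits] in (-1, 1); labels: [batch] class indices."""
+    target = (centers[labels] + 1) / 2               # map centers to {0,1}
+    prob = ((codes + 1) / 2).clamp(1e-6, 1 - 1e-6)   # map codes to (0,1)
+    return F.binary_cross_entropy(prob, target)
+
+codes = torch.tanh(torch.randn(8, bits))
+print(central_similarity_loss(codes, torch.randint(0, n_classes, (8,))).item())
+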
+
+ comment: accepted by the Asia Pacific Web (APWeb) and Web-Age Information + Management (WAIM) Joint International Conference on Web and Big Data + (APWeb-WAIM2023) +
+
+
+
+
+ + ☆ How Can Context Help? Exploring Joint Retrieval of Passage and + Personalized Context + + +
+ The integration of external personalized context information into +document-grounded conversational systems has significant potential business +value, but has not been well-studied. Motivated by the concept of personalized +context-aware document-grounded conversational systems, we introduce the task +of context-aware passage retrieval. We also construct a dataset specifically +curated for this purpose. We describe multiple baseline systems to address this +task, and propose a novel approach, Personalized Context-Aware Search (PCAS), +that effectively harnesses contextual information during passage retrieval. +Experimental evaluations conducted on multiple popular dense retrieval systems +demonstrate that our proposed approach not only outperforms the baselines in +retrieving the most relevant passage but also excels at identifying the +pertinent context among all the available contexts. We envision that our +contributions will serve as a catalyst for inspiring future research endeavors +in this promising direction. + +
+
+
+
+
+ + ☆ ZC3: Zero-Shot Cross-Language Code Clone Detection + + +
+ Developers introduce code clones to improve programming productivity. Many
+existing studies have achieved impressive performance in monolingual code clone
+detection. However, during software development, more and more developers write
+semantically equivalent programs in different languages to support different
+platforms and help developers translate projects from one language to another.
+Considering that collecting cross-language parallel data, especially for
+low-resource languages, is expensive and time-consuming, how to design an
+effective cross-language model that does not rely on any parallel data is a
+significant problem. In this paper, we propose a novel method named ZC3 for
+Zero-shot Cross-language Code Clone detection. ZC3 designs contrastive snippet
+prediction to form an isomorphic representation space among different
+programming languages. Based on this, ZC3 exploits domain-aware learning and
+cycle consistency learning to further constrain the model to generate
+representations that are aligned across different languages while remaining
+discriminative for different types of clones. To evaluate our approach, we
+conduct extensive experiments on four representative cross-language clone
+detection datasets. Experimental results show that ZC3 outperforms the
+state-of-the-art baselines by 67.12%, 51.39%, 14.85%, and 53.01% in MAP score
+on these datasets, respectively. We further investigate the representational
+distribution of different languages and discuss the effectiveness of our
+method.
+
+
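+ At inference time, a zero-shot cross-language clone detector of this kind is
+typically applied by embedding both snippets with the shared encoder and
+thresholding their cosine similarity, as sketched below. The placeholder
+encoder and the threshold are assumptions; ZC3's training losses are what
+would shape the real embedding space.
+
+# Inference-time clone check with a placeholder cross-language encoder.
+import torch
+import torch.nn.functional as F
+
+def embed(snippet: str) -> torch.Tensor:
+    """Stand-in for the trained encoder (deterministic random vectors)."""
+    torch.manual_seed(abs(hash(snippet)) % (2**31))
+    return torch.randn(768)
+
+def is_clone(a: str, b: str, threshold: float = 0.8) -> bool:
+    sim = F.cosine_similarity(embed(a), embed(b), dim=0).item()
+    return sim >= threshold
+
+print(is_clone("def add(a, b): return a + b",
+               "int add(int a, int b) { return a + b; }"))
+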
+
+ comment: Accepted by the 38th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2023) +
+
+
+
+
+ + ♻ ☆ MUSE: Music Recommender System with Shuffle Play Recommendation + Enhancement CIKM 2023 + + +
+ Recommender systems have become indispensable in music streaming services,
+enhancing user experiences by personalizing playlists and facilitating the
+serendipitous discovery of new music. However, existing recommender systems
+overlook the unique challenges inherent in the music domain, specifically
+shuffle play, which provides subsequent tracks in a random sequence. Based on
+our observation that shuffle play sessions hinder the overall training process
+of music recommender systems, mainly due to their high unique transition rates,
+we propose a Music Recommender System with Shuffle Play Recommendation
+Enhancement (MUSE). MUSE employs a self-supervised learning framework that
+maximizes the agreement between the original session and the augmented session,
+which is augmented by our novel session augmentation method, called
+transition-based augmentation. To further facilitate the alignment of the
+representations between the two views, we devise two fine-grained matching
+strategies, i.e., item- and similarity-based matching strategies. Through
+rigorous experiments conducted across diverse environments, we demonstrate
+MUSE's efficacy over 12 baseline models on a large-scale Music Streaming
+Sessions Dataset (MSSD) from Spotify. The source code of MUSE is available at
+\url{https://github.com/yunhak0/MUSE}.
+
+
+
+ comment: CIKM 2023 +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ The DiffuseStyleGesture+ entry to the GENEA Challenge 2023 + + +
+ In this paper, we introduce DiffuseStyleGesture+, our solution for the
+Generation and Evaluation of Non-verbal Behavior for Embodied Agents (GENEA)
+Challenge 2023, which aims to foster the development of realistic, automated
+systems for generating conversational gestures. Participants are provided with
+a pre-processed dataset and their systems are evaluated through crowdsourced
+scoring. Our proposed model, DiffuseStyleGesture+, leverages a diffusion model
+to generate gestures automatically. It incorporates a variety of modalities,
+including audio, text, speaker ID, and seed gestures. These diverse modalities
+are mapped to a hidden space and processed by a modified diffusion model to
+produce the corresponding gesture for a given speech input. Upon evaluation,
+DiffuseStyleGesture+ demonstrated performance on par with the top-tier models
+in the challenge, showing no significant differences from those models in
+human-likeness or appropriateness for the interlocutor, and achieving
+performance competitive with the best model on appropriateness for agent
+speech. This indicates that our model is competitive and effective in
+generating realistic and appropriate gestures for given speech. The code,
+pre-trained models, and demos are available at
+https://github.com/YoungSeng/DiffuseStyleGesture/tree/DiffuseStyleGesturePlus/BEAT-TWH-main.
+
+
+
+ comment: 7 pages, 8 figures, ICMI 2023 +
+
+
+
+
+ + ☆ Reinforcement Learning Based Multi-modal Feature Fusion Network for + Novel Class Discovery + + +
+ With the development of deep learning techniques, supervised learning has
+achieved performances surpassing those of humans. Researchers have designed
+numerous corresponding models for different data modalities, achieving
+excellent results in supervised tasks. However, with the exponential increase
+of data in multiple fields, the recognition and classification of unlabeled
+data have gradually become a hot topic. In this paper, we employ a
+Reinforcement Learning framework to simulate the cognitive processes of humans
+for effectively addressing novel class discovery in the open-set domain. We
+deploy a Member-to-Leader Multi-Agent framework to extract and fuse features
+from multi-modal information, aiming to acquire a more comprehensive
+understanding of the feature space. Furthermore, this approach facilitates the
+incorporation of self-supervised learning to enhance model training. We employ
+a clustering method with varying constraint conditions, ranging from strict to
+loose, allowing for the generation of dependable labels for a subset of
+unlabeled data during the training phase. This iterative process is similar to
+human exploratory learning of unknown data. These mechanisms collectively
+update the network parameters based on rewards received from environmental
+feedback. This process enables effective control over the extent of exploratory
+learning, ensuring the accuracy of learning in unknown data categories. We
+demonstrate the performance of our approach in both the 3D and 2D domains by
+employing the OS-MN40, OS-MN40-Miss, and Cifar10 datasets. Our approach
+achieves competitive results.
+
+
+
+
+
+
+ + ☆ Central Similarity Multi-View Hashing for Multimedia Retrieval APWeb + + +
+ Hash representation learning of multi-view heterogeneous data is the key to
+improving the accuracy of multimedia retrieval. However, existing methods rely
+only on local similarity to train their models, ignoring global similarity, and
+fall short of deeply fusing the multi-view features, resulting in poor
+retrieval accuracy. Furthermore, most recent works fuse the multi-view features
+via a weighted sum or concatenation. We contend that these fusion methods are
+insufficient for capturing the interaction between various views. We present a
+novel Central Similarity Multi-View Hashing (CSMVH) method to address the
+mentioned problems. Central similarity learning is used for solving the local
+similarity problem, as it can exploit the global similarity between the hash
+center and samples. We present copious empirical data demonstrating the
+superiority of gate-based fusion over conventional approaches. On the MS COCO
+and NUS-WIDE datasets, the proposed CSMVH performs better than the
+state-of-the-art methods by a large margin (up to 11.41% mean Average Precision
+(mAP) improvement).
+
+
+
+ comment: accepted by the Asia Pacific Web (APWeb) and Web-Age Information + Management (WAIM) Joint International Conference on Web and Big Data + (APWeb-WAIM2023) +
+
+
+
+
+ + ♻ ☆ Towards Top-Down Stereoscopic Image Quality Assessment via Stereo + Attention + + +
+ Stereoscopic image quality assessment (SIQA) plays a crucial role in +evaluating and improving the visual experience of 3D content. Existing +binocular properties and attention-based methods for SIQA have achieved +promising performance. However, these bottom-up approaches are inadequate in +exploiting the inherent characteristics of the human visual system (HVS). This +paper presents a novel network for SIQA via stereo attention, employing a +top-down perspective to guide the quality assessment process. Our proposed +method realizes the guidance from high-level binocular signals down to +low-level monocular signals, while the binocular and monocular information can +be calibrated progressively throughout the processing pipeline. We design a +generalized Stereo AttenTion (SAT) block to implement the top-down philosophy +in stereo perception. This block utilizes the fusion-generated attention map as +a high-level binocular modulator, influencing the representation of two +low-level monocular features. Additionally, we introduce an Energy Coefficient +(EC) to account for recent findings indicating that binocular responses in the +primate primary visual cortex are less than the sum of monocular responses. The +adaptive EC can tune the magnitude of binocular response flexibly, thus +enhancing the formation of robust binocular features within our framework. To +extract the most discriminative quality information from the summation and +subtraction of the two branches of monocular features, we utilize a +dual-pooling strategy that applies min-pooling and max-pooling operations to +the respective branches. Experimental results highlight the superiority of our +top-down method in simulating the property of visual perception and advancing +the state-of-the-art in the SIQA field. The code of this work is available at +https://github.com/Fanning-Zhang/SATNet. + +
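+ As a rough illustration of the dual-pooling strategy, the sketch below
+max-pools the summation branch and min-pools the subtraction branch of two
+monocular feature maps; which pooling is paired with which branch, and the
+toy tensor shapes, are our assumptions rather than the paper's exact design.
+
+# Dual pooling over summation/subtraction branches (toy shapes, assumptions).
+import torch
+import torch.nn.functional as F
+
+left = torch.randn(1, 64, 32, 32)   # monocular feature maps
+right = torch.randn(1, 64, 32, 32)
+
+summation = left + right
+subtraction = left - right
+
+max_branch = F.adaptive_max_pool2d(summation, 1).flatten(1)
+min_branch = -F.adaptive_max_pool2d(-subtraction, 1).flatten(1)  # min-pooling
+
+quality_features = torch.cat([max_branch, min_branch], dim=1)
+print(quality_features.shape)  # torch.Size([1, 128])
+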
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+
+
+ + + +
+
+ +
+
+

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all paper entries with the TAB key */
+let expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) { // 9 = TAB
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false; // suppress the default focus-moving behavior
+    }
+};
+
+/* Switch theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme'); // null if never set
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`